Home | History | Annotate | Download | only in gallivm
      1 /**************************************************************************
      2  *
      3  * Copyright 2009 VMware, Inc.
      4  * All Rights Reserved.
      5  *
      6  * Permission is hereby granted, free of charge, to any person obtaining a
      7  * copy of this software and associated documentation files (the
      8  * "Software"), to deal in the Software without restriction, including
      9  * without limitation the rights to use, copy, modify, merge, publish,
     10  * distribute, sub license, and/or sell copies of the Software, and to
     11  * permit persons to whom the Software is furnished to do so, subject to
     12  * the following conditions:
     13  *
     14  * The above copyright notice and this permission notice (including the
     15  * next paragraph) shall be included in all copies or substantial portions
     16  * of the Software.
     17  *
     18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
     19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
     20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
     21  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
     22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
     23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
     24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
     25  *
     26  **************************************************************************/
     27 
     28 /**
     29  * @file
     30  * Texture sampling -- common code.
     31  *
     32  * @author Jose Fonseca <jfonseca (at) vmware.com>
     33  */
     34 
     35 #include "pipe/p_defines.h"
     36 #include "pipe/p_state.h"
     37 #include "util/u_format.h"
     38 #include "util/u_math.h"
     39 #include "util/u_cpu_detect.h"
     40 #include "lp_bld_arit.h"
     41 #include "lp_bld_const.h"
     42 #include "lp_bld_debug.h"
     43 #include "lp_bld_printf.h"
     44 #include "lp_bld_flow.h"
     45 #include "lp_bld_sample.h"
     46 #include "lp_bld_swizzle.h"
     47 #include "lp_bld_type.h"
     48 #include "lp_bld_logic.h"
     49 #include "lp_bld_pack.h"
     50 #include "lp_bld_quad.h"
     51 #include "lp_bld_bitarit.h"
     52 
     53 
     54 /*
     55  * Bri-linear factor. Should be greater than one.
     56  */
     57 #define BRILINEAR_FACTOR 2
     58 
     59 /**
     60  * Does the given texture wrap mode allow sampling the texture border color?
     61  * XXX maybe move this into gallium util code.
     62  */
     63 boolean
     64 lp_sampler_wrap_mode_uses_border_color(unsigned mode,
     65                                        unsigned min_img_filter,
     66                                        unsigned mag_img_filter)
     67 {
     68    switch (mode) {
     69    case PIPE_TEX_WRAP_REPEAT:
     70    case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
     71    case PIPE_TEX_WRAP_MIRROR_REPEAT:
     72    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
     73       return FALSE;
     74    case PIPE_TEX_WRAP_CLAMP:
     75    case PIPE_TEX_WRAP_MIRROR_CLAMP:
     76       if (min_img_filter == PIPE_TEX_FILTER_NEAREST &&
     77           mag_img_filter == PIPE_TEX_FILTER_NEAREST) {
     78          return FALSE;
     79       } else {
     80          return TRUE;
     81       }
     82    case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
     83    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
     84       return TRUE;
     85    default:
     86       assert(0 && "unexpected wrap mode");
     87       return FALSE;
     88    }
     89 }
     90 
     91 
     92 /**
     93  * Initialize lp_sampler_static_texture_state object with the gallium
     94  * texture/sampler_view state (this contains the parts which are
     95  * considered static).
     96  */
     97 void
     98 lp_sampler_static_texture_state(struct lp_static_texture_state *state,
     99                                 const struct pipe_sampler_view *view)
    100 {
    101    const struct pipe_resource *texture;
    102 
    103    memset(state, 0, sizeof *state);
    104 
    105    if (!view || !view->texture)
    106       return;
    107 
    108    texture = view->texture;
    109 
    110    state->format            = view->format;
    111    state->swizzle_r         = view->swizzle_r;
    112    state->swizzle_g         = view->swizzle_g;
    113    state->swizzle_b         = view->swizzle_b;
    114    state->swizzle_a         = view->swizzle_a;
    115 
    116    state->target            = view->target;
    117    state->pot_width         = util_is_power_of_two(texture->width0);
    118    state->pot_height        = util_is_power_of_two(texture->height0);
    119    state->pot_depth         = util_is_power_of_two(texture->depth0);
    120    state->level_zero_only   = !view->u.tex.last_level;
    121 
    122    /*
    123     * the layer / element / level parameters are all either dynamic
    124     * state or handled transparently wrt execution.
    125     */
    126 }
    127 
    128 
    129 /**
    130  * Initialize lp_sampler_static_sampler_state object with the gallium sampler
    131  * state (this contains the parts which are considered static).
    132  */
    133 void
    134 lp_sampler_static_sampler_state(struct lp_static_sampler_state *state,
    135                                 const struct pipe_sampler_state *sampler)
    136 {
    137    memset(state, 0, sizeof *state);
    138 
    139    if (!sampler)
    140       return;
    141 
    142    /*
    143     * We don't copy sampler state over unless it is actually enabled, to avoid
    144     * spurious recompiles, as the sampler static state is part of the shader
    145     * key.
    146     *
    147     * Ideally the state tracker or cso_cache module would make all state
    148     * canonical, but until that happens it's better to be safe than sorry here.
    149     *
    150     * XXX: Actually there's much more than can be done here, especially
    151     * regarding 1D/2D/3D/CUBE textures, wrap modes, etc.
    152     */
    153 
    154    state->wrap_s            = sampler->wrap_s;
    155    state->wrap_t            = sampler->wrap_t;
    156    state->wrap_r            = sampler->wrap_r;
    157    state->min_img_filter    = sampler->min_img_filter;
    158    state->mag_img_filter    = sampler->mag_img_filter;
    159    state->min_mip_filter    = sampler->min_mip_filter;
    160    state->seamless_cube_map = sampler->seamless_cube_map;
    161 
    162    if (sampler->max_lod > 0.0f) {
    163       state->max_lod_pos = 1;
    164    }
    165 
    166    if (sampler->lod_bias != 0.0f) {
    167       state->lod_bias_non_zero = 1;
    168    }
    169 
    170    if (state->min_mip_filter != PIPE_TEX_MIPFILTER_NONE ||
    171        state->min_img_filter != state->mag_img_filter) {
    172 
    173       /* If min_lod == max_lod we can greatly simplify mipmap selection.
    174        * This is a case that occurs during automatic mipmap generation.
    175        */
    176       if (sampler->min_lod == sampler->max_lod) {
    177          state->min_max_lod_equal = 1;
    178       } else {
    179          if (sampler->min_lod > 0.0f) {
    180             state->apply_min_lod = 1;
    181          }
    182 
    183          /*
    184           * XXX this won't do anything with the mesa state tracker which always
    185           * sets max_lod to not more than actually present mip maps...
    186           */
    187          if (sampler->max_lod < (PIPE_MAX_TEXTURE_LEVELS - 1)) {
    188             state->apply_max_lod = 1;
    189          }
    190       }
    191    }
    192 
    193    state->compare_mode      = sampler->compare_mode;
    194    if (sampler->compare_mode != PIPE_TEX_COMPARE_NONE) {
    195       state->compare_func   = sampler->compare_func;
    196    }
    197 
    198    state->normalized_coords = sampler->normalized_coords;
    199 }
    200 
    201 
/**
 * Generate code to compute coordinate gradient (rho).
 * \param derivs  partial derivatives of (s, t, r, q) with respect to X and Y
 *
 * The resulting rho has bld->levelf format (per quad or per element).
 *
 * Three mutually exclusive paths are emitted:
 *  - cube_rho given: cube-map code already did most of the work, only the
 *    size multiply and per-quad extraction happen here;
 *  - derivs given: explicit derivatives supplied by the shader;
 *  - otherwise: derivatives are approximated from quad neighbor diffs
 *    via the packed ddx/ddy helpers.
 * NOTE(review): on the no_rho_opt paths the sqrt is skipped, so the value
 * returned there is rho squared (callers appear to compensate -- the
 * compensation is not visible in this chunk, confirm against callers).
 */
static LLVMValueRef
lp_build_rho(struct lp_build_sample_context *bld,
             unsigned texture_unit,
             LLVMValueRef s,
             LLVMValueRef t,
             LLVMValueRef r,
             LLVMValueRef cube_rho,
             const struct lp_derivatives *derivs)
{
   struct gallivm_state *gallivm = bld->gallivm;
   struct lp_build_context *int_size_bld = &bld->int_size_in_bld;
   struct lp_build_context *float_size_bld = &bld->float_size_in_bld;
   struct lp_build_context *float_bld = &bld->float_bld;
   struct lp_build_context *coord_bld = &bld->coord_bld;
   struct lp_build_context *rho_bld = &bld->lodf_bld;
   const unsigned dims = bld->dims;
   LLVMValueRef ddx_ddy[2] = {NULL};
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
   LLVMValueRef index1 = LLVMConstInt(i32t, 1, 0);
   LLVMValueRef index2 = LLVMConstInt(i32t, 2, 0);
   LLVMValueRef rho_vec;
   LLVMValueRef int_size, float_size;
   LLVMValueRef rho;
   LLVMValueRef first_level, first_level_vec;
   unsigned length = coord_bld->type.length;
   unsigned num_quads = length / 4;
   /* per-quad rho iff the lod vector is narrower than the coord vector */
   boolean rho_per_quad = rho_bld->type.length != length;
   boolean no_rho_opt = bld->no_rho_approx && (dims > 1);
   unsigned i;
   LLVMValueRef i32undef = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
   LLVMValueRef rho_xvec, rho_yvec;

   /* Note that all simplified calculations will only work for isotropic filtering */

   /*
    * rho calcs are always per quad except for explicit derivs (excluding
    * the messy cube maps for now) when requested.
    */

   /* Size of the base mip level actually sampled (minified by first_level). */
   first_level = bld->dynamic_state->first_level(bld->dynamic_state, bld->gallivm,
                                                 bld->context_ptr, texture_unit);
   first_level_vec = lp_build_broadcast_scalar(int_size_bld, first_level);
   int_size = lp_build_minify(int_size_bld, bld->int_size, first_level_vec, TRUE);
   float_size = lp_build_int_to_float(float_size_bld, int_size);

   if (cube_rho) {
      LLVMValueRef cubesize;
      /* NOTE(review): this shadows the outer index0 (same constant value). */
      LLVMValueRef index0 = lp_build_const_int32(gallivm, 0);

      /*
       * Cube map code did already everything except size mul and per-quad extraction.
       * Luckily cube maps are always quadratic!
       */
      if (rho_per_quad) {
         rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
                                         rho_bld->type, cube_rho, 0);
      }
      else {
         rho = lp_build_swizzle_scalar_aos(coord_bld, cube_rho, 0, 4);
      }
      /* Could optimize this for single quad just skip the broadcast */
      cubesize = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
                                            rho_bld->type, float_size, index0);
      /* skipping sqrt hence returning rho squared */
      cubesize = lp_build_mul(rho_bld, cubesize, cubesize);
      rho = lp_build_mul(rho_bld, cubesize, rho);
   }
   else if (derivs) {
      /* Explicit derivatives: scale each ddx/ddy by the texture dimension. */
      LLVMValueRef ddmax[3], ddx[3], ddy[3];
      for (i = 0; i < dims; i++) {
         LLVMValueRef floatdim;
         LLVMValueRef indexi = lp_build_const_int32(gallivm, i);

         floatdim = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
                                               coord_bld->type, float_size, indexi);

         /*
          * note that for rho_per_quad case could reduce math (at some shuffle
          * cost), but for now use same code to per-pixel lod case.
          */
         if (no_rho_opt) {
            /* Accurate path: accumulate squared scaled derivatives. */
            ddx[i] = lp_build_mul(coord_bld, floatdim, derivs->ddx[i]);
            ddy[i] = lp_build_mul(coord_bld, floatdim, derivs->ddy[i]);
            ddx[i] = lp_build_mul(coord_bld, ddx[i], ddx[i]);
            ddy[i] = lp_build_mul(coord_bld, ddy[i], ddy[i]);
         }
         else {
            /* Approximate path: max(|ddx|, |ddy|) per coordinate. */
            LLVMValueRef tmpx, tmpy;
            tmpx = lp_build_abs(coord_bld, derivs->ddx[i]);
            tmpy = lp_build_abs(coord_bld, derivs->ddy[i]);
            ddmax[i] = lp_build_max(coord_bld, tmpx, tmpy);
            ddmax[i] = lp_build_mul(coord_bld, floatdim, ddmax[i]);
         }
      }
      if (no_rho_opt) {
         /* rho = max over x/y of the summed squared gradient lengths */
         rho_xvec = lp_build_add(coord_bld, ddx[0], ddx[1]);
         rho_yvec = lp_build_add(coord_bld, ddy[0], ddy[1]);
         if (dims > 2) {
            rho_xvec = lp_build_add(coord_bld, rho_xvec, ddx[2]);
            rho_yvec = lp_build_add(coord_bld, rho_yvec, ddy[2]);
         }
         rho = lp_build_max(coord_bld, rho_xvec, rho_yvec);
         /* skipping sqrt hence returning rho squared */
     }
      else {
         rho = ddmax[0];
         if (dims > 1) {
            rho = lp_build_max(coord_bld, rho, ddmax[1]);
            if (dims > 2) {
               rho = lp_build_max(coord_bld, rho, ddmax[2]);
            }
         }
      }
      if (rho_per_quad) {
         /*
          * rho_vec contains per-pixel rho, convert to scalar per quad.
          */
         rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
                                         rho_bld->type, rho, 0);
      }
   }
   else {
      /*
       * Implicit derivatives from quad neighbor differences.
       *
       * This looks all a bit complex, but it's not that bad
       * (the shuffle code makes it look worse than it is).
       * Still, might not be ideal for all cases.
       */
      static const unsigned char swizzle0[] = { /* no-op swizzle */
         0, LP_BLD_SWIZZLE_DONTCARE,
         LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
      };
      static const unsigned char swizzle1[] = {
         1, LP_BLD_SWIZZLE_DONTCARE,
         LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
      };
      static const unsigned char swizzle2[] = {
         2, LP_BLD_SWIZZLE_DONTCARE,
         LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
      };

      /* Packed ddx/ddy: one/two coords interleaved into a single vector. */
      if (dims < 2) {
         ddx_ddy[0] = lp_build_packed_ddx_ddy_onecoord(coord_bld, s);
      }
      else if (dims >= 2) {
         ddx_ddy[0] = lp_build_packed_ddx_ddy_twocoord(coord_bld, s, t);
         if (dims > 2) {
            ddx_ddy[1] = lp_build_packed_ddx_ddy_onecoord(coord_bld, r);
         }
      }

      if (no_rho_opt) {
         static const unsigned char swizzle01[] = { /* no-op swizzle */
            0, 1,
            LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
         };
         static const unsigned char swizzle23[] = {
            2, 3,
            LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
         };
         LLVMValueRef ddx_ddys, ddx_ddyt, floatdim, shuffles[LP_MAX_VECTOR_LENGTH / 4];

         /* Build a width,width,height,height pattern per quad for the scale. */
         for (i = 0; i < num_quads; i++) {
            shuffles[i*4+0] = shuffles[i*4+1] = index0;
            shuffles[i*4+2] = shuffles[i*4+3] = index1;
         }
         floatdim = LLVMBuildShuffleVector(builder, float_size, float_size,
                                           LLVMConstVector(shuffles, length), "");
         ddx_ddy[0] = lp_build_mul(coord_bld, ddx_ddy[0], floatdim);
         ddx_ddy[0] = lp_build_mul(coord_bld, ddx_ddy[0], ddx_ddy[0]);
         ddx_ddys = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle01);
         ddx_ddyt = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle23);
         rho_vec = lp_build_add(coord_bld, ddx_ddys, ddx_ddyt);

         if (dims > 2) {
            static const unsigned char swizzle02[] = {
               0, 2,
               LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
            };
            floatdim = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
                                                  coord_bld->type, float_size, index2);
            ddx_ddy[1] = lp_build_mul(coord_bld, ddx_ddy[1], floatdim);
            ddx_ddy[1] = lp_build_mul(coord_bld, ddx_ddy[1], ddx_ddy[1]);
            ddx_ddy[1] = lp_build_swizzle_aos(coord_bld, ddx_ddy[1], swizzle02);
            rho_vec = lp_build_add(coord_bld, rho_vec, ddx_ddy[1]);
         }

         rho_xvec = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle0);
         rho_yvec = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1);
         rho = lp_build_max(coord_bld, rho_xvec, rho_yvec);

         if (rho_per_quad) {
            rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
                                            rho_bld->type, rho, 0);
         }
         else {
            rho = lp_build_swizzle_scalar_aos(coord_bld, rho, 0, 4);
         }
         /* skipping sqrt hence returning rho squared */
      }
      else {
         /* Approximate path: work with |ddx|, |ddy| and take maxima. */
         ddx_ddy[0] = lp_build_abs(coord_bld, ddx_ddy[0]);
         if (dims > 2) {
            ddx_ddy[1] = lp_build_abs(coord_bld, ddx_ddy[1]);
         }
         else {
            ddx_ddy[1] = NULL; /* silence compiler warning */
         }

         /* Separate the packed ddx/ddy values into x- and y-gradient vectors. */
         if (dims < 2) {
            rho_xvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle0);
            rho_yvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle2);
         }
         else if (dims == 2) {
            static const unsigned char swizzle02[] = {
               0, 2,
               LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
            };
            static const unsigned char swizzle13[] = {
               1, 3,
               LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
            };
            rho_xvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle02);
            rho_yvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle13);
         }
         else {
            /* 3D: merge the two packed vectors with explicit shuffles. */
            LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH];
            LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH];
            assert(dims == 3);
            for (i = 0; i < num_quads; i++) {
               shuffles1[4*i + 0] = lp_build_const_int32(gallivm, 4*i);
               shuffles1[4*i + 1] = lp_build_const_int32(gallivm, 4*i + 2);
               shuffles1[4*i + 2] = lp_build_const_int32(gallivm, length + 4*i);
               shuffles1[4*i + 3] = i32undef;
               shuffles2[4*i + 0] = lp_build_const_int32(gallivm, 4*i + 1);
               shuffles2[4*i + 1] = lp_build_const_int32(gallivm, 4*i + 3);
               shuffles2[4*i + 2] = lp_build_const_int32(gallivm, length + 4*i + 2);
               shuffles2[4*i + 3] = i32undef;
            }
            rho_xvec = LLVMBuildShuffleVector(builder, ddx_ddy[0], ddx_ddy[1],
                                              LLVMConstVector(shuffles1, length), "");
            rho_yvec = LLVMBuildShuffleVector(builder, ddx_ddy[0], ddx_ddy[1],
                                              LLVMConstVector(shuffles2, length), "");
         }

         rho_vec = lp_build_max(coord_bld, rho_xvec, rho_yvec);

         if (bld->coord_type.length > 4) {
            /* expand size to each quad */
            if (dims > 1) {
               /* could use some broadcast_vector helper for this? */
               LLVMValueRef src[LP_MAX_VECTOR_LENGTH/4];
               for (i = 0; i < num_quads; i++) {
                  src[i] = float_size;
               }
               float_size = lp_build_concat(bld->gallivm, src, float_size_bld->type, num_quads);
            }
            else {
               float_size = lp_build_broadcast_scalar(coord_bld, float_size);
            }
            rho_vec = lp_build_mul(coord_bld, rho_vec, float_size);

            if (dims <= 1) {
               rho = rho_vec;
            }
            else {
               if (dims >= 2) {
                  /* take the max over the s/t (and r) gradient components */
                  LLVMValueRef rho_s, rho_t, rho_r;

                  rho_s = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle0);
                  rho_t = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1);

                  rho = lp_build_max(coord_bld, rho_s, rho_t);

                  if (dims >= 3) {
                     rho_r = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle2);
                     rho = lp_build_max(coord_bld, rho, rho_r);
                  }
               }
            }
            if (rho_per_quad) {
               rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
                                               rho_bld->type, rho, 0);
            }
            else {
               rho = lp_build_swizzle_scalar_aos(coord_bld, rho, 0, 4);
            }
         }
         else {
            /* Single-quad case: use scalar extracts instead of swizzles. */
            if (dims <= 1) {
               rho_vec = LLVMBuildExtractElement(builder, rho_vec, index0, "");
            }
            rho_vec = lp_build_mul(float_size_bld, rho_vec, float_size);

            if (dims <= 1) {
               rho = rho_vec;
            }
            else {
               if (dims >= 2) {
                  LLVMValueRef rho_s, rho_t, rho_r;

                  rho_s = LLVMBuildExtractElement(builder, rho_vec, index0, "");
                  rho_t = LLVMBuildExtractElement(builder, rho_vec, index1, "");

                  rho = lp_build_max(float_bld, rho_s, rho_t);

                  if (dims >= 3) {
                     rho_r = LLVMBuildExtractElement(builder, rho_vec, index2, "");
                     rho = lp_build_max(float_bld, rho, rho_r);
                  }
               }
            }
            if (!rho_per_quad) {
               rho = lp_build_broadcast_scalar(rho_bld, rho);
            }
         }
      }
   }

   return rho;
}
    530 
    531 
    532 /*
    533  * Bri-linear lod computation
    534  *
    535  * Use a piece-wise linear approximation of log2 such that:
    536  * - round to nearest, for values in the neighborhood of -1, 0, 1, 2, etc.
    537  * - linear approximation for values in the neighborhood of 0.5, 1.5., etc,
    538  *   with the steepness specified in 'factor'
    539  * - exact result for 0.5, 1.5, etc.
    540  *
    541  *
    542  *   1.0 -              /----*
    543  *                     /
    544  *                    /
    545  *                   /
    546  *   0.5 -          *
    547  *                 /
    548  *                /
    549  *               /
    550  *   0.0 - *----/
    551  *
    552  *         |                 |
    553  *        2^0               2^1
    554  *
    555  * This is a technique also commonly used in hardware:
    556  * - http://ixbtlabs.com/articles2/gffx/nv40-rx800-3.html
    557  *
    558  * TODO: For correctness, this should only be applied when texture is known to
    559  * have regular mipmaps, i.e., mipmaps derived from the base level.
    560  *
    561  * TODO: This could be done in fixed point, where applicable.
    562  */
    563 static void
    564 lp_build_brilinear_lod(struct lp_build_context *bld,
    565                        LLVMValueRef lod,
    566                        double factor,
    567                        LLVMValueRef *out_lod_ipart,
    568                        LLVMValueRef *out_lod_fpart)
    569 {
    570    LLVMValueRef lod_fpart;
    571    double pre_offset = (factor - 0.5)/factor - 0.5;
    572    double post_offset = 1 - factor;
    573 
    574    if (0) {
    575       lp_build_printf(bld->gallivm, "lod = %f\n", lod);
    576    }
    577 
    578    lod = lp_build_add(bld, lod,
    579                       lp_build_const_vec(bld->gallivm, bld->type, pre_offset));
    580 
    581    lp_build_ifloor_fract(bld, lod, out_lod_ipart, &lod_fpart);
    582 
    583    lod_fpart = lp_build_mad(bld, lod_fpart,
    584                             lp_build_const_vec(bld->gallivm, bld->type, factor),
    585                             lp_build_const_vec(bld->gallivm, bld->type, post_offset));
    586 
    587    /*
    588     * It's not necessary to clamp lod_fpart since:
    589     * - the above expression will never produce numbers greater than one.
    590     * - the mip filtering branch is only taken if lod_fpart is positive
    591     */
    592 
    593    *out_lod_fpart = lod_fpart;
    594 
    595    if (0) {
    596       lp_build_printf(bld->gallivm, "lod_ipart = %i\n", *out_lod_ipart);
    597       lp_build_printf(bld->gallivm, "lod_fpart = %f\n\n", *out_lod_fpart);
    598    }
    599 }
    600 
    601 
    602 /*
    603  * Combined log2 and brilinear lod computation.
    604  *
    605  * It's in all identical to calling lp_build_fast_log2() and
    606  * lp_build_brilinear_lod() above, but by combining we can compute the integer
    607  * and fractional part independently.
    608  */
    609 static void
    610 lp_build_brilinear_rho(struct lp_build_context *bld,
    611                        LLVMValueRef rho,
    612                        double factor,
    613                        LLVMValueRef *out_lod_ipart,
    614                        LLVMValueRef *out_lod_fpart)
    615 {
    616    LLVMValueRef lod_ipart;
    617    LLVMValueRef lod_fpart;
    618 
    619    const double pre_factor = (2*factor - 0.5)/(M_SQRT2*factor);
    620    const double post_offset = 1 - 2*factor;
    621 
    622    assert(bld->type.floating);
    623 
    624    assert(lp_check_value(bld->type, rho));
    625 
    626    /*
    627     * The pre factor will make the intersections with the exact powers of two
    628     * happen precisely where we want them to be, which means that the integer
    629     * part will not need any post adjustments.
    630     */
    631    rho = lp_build_mul(bld, rho,
    632                       lp_build_const_vec(bld->gallivm, bld->type, pre_factor));
    633 
    634    /* ipart = ifloor(log2(rho)) */
    635    lod_ipart = lp_build_extract_exponent(bld, rho, 0);
    636 
    637    /* fpart = rho / 2**ipart */
    638    lod_fpart = lp_build_extract_mantissa(bld, rho);
    639 
    640    lod_fpart = lp_build_mad(bld, lod_fpart,
    641                             lp_build_const_vec(bld->gallivm, bld->type, factor),
    642                             lp_build_const_vec(bld->gallivm, bld->type, post_offset));
    643 
    644    /*
    645     * Like lp_build_brilinear_lod, it's not necessary to clamp lod_fpart since:
    646     * - the above expression will never produce numbers greater than one.
    647     * - the mip filtering branch is only taken if lod_fpart is positive
    648     */
    649 
    650    *out_lod_ipart = lod_ipart;
    651    *out_lod_fpart = lod_fpart;
    652 }
    653 
    654 
    655 /**
    656  * Fast implementation of iround(log2(sqrt(x))), based on
    657  * log2(x^n) == n*log2(x).
    658  *
    659  * Gives accurate results all the time.
    660  * (Could be trivially extended to handle other power-of-two roots.)
    661  */
    662 static LLVMValueRef
    663 lp_build_ilog2_sqrt(struct lp_build_context *bld,
    664                     LLVMValueRef x)
    665 {
    666    LLVMBuilderRef builder = bld->gallivm->builder;
    667    LLVMValueRef ipart;
    668    struct lp_type i_type = lp_int_type(bld->type);
    669    LLVMValueRef one = lp_build_const_int_vec(bld->gallivm, i_type, 1);
    670 
    671    assert(bld->type.floating);
    672 
    673    assert(lp_check_value(bld->type, x));
    674 
    675    /* ipart = log2(x) + 0.5 = 0.5*(log2(x^2) + 1.0) */
    676    ipart = lp_build_extract_exponent(bld, x, 1);
    677    ipart = LLVMBuildAShr(builder, ipart, one, "");
    678 
    679    return ipart;
    680 }
    681 
    682 
    683 /**
    684  * Generate code to compute texture level of detail (lambda).
    685  * \param derivs  partial derivatives of (s, t, r, q) with respect to X and Y
    686  * \param lod_bias  optional float vector with the shader lod bias
    687  * \param explicit_lod  optional float vector with the explicit lod
    688  * \param cube_rho  rho calculated by cube coord mapping (optional)
    689  * \param out_lod_ipart  integer part of lod
    690  * \param out_lod_fpart  float part of lod (never larger than 1 but may be negative)
    691  * \param out_lod_positive  (mask) if lod is positive (i.e. texture is minified)
    692  *
    693  * The resulting lod can be scalar per quad or be per element.
    694  */
void
lp_build_lod_selector(struct lp_build_sample_context *bld,
                      boolean is_lodq,
                      unsigned texture_unit,
                      unsigned sampler_unit,
                      LLVMValueRef s,
                      LLVMValueRef t,
                      LLVMValueRef r,
                      LLVMValueRef cube_rho,
                      const struct lp_derivatives *derivs,
                      LLVMValueRef lod_bias, /* optional */
                      LLVMValueRef explicit_lod, /* optional */
                      unsigned mip_filter,
                      LLVMValueRef *out_lod,
                      LLVMValueRef *out_lod_ipart,
                      LLVMValueRef *out_lod_fpart,
                      LLVMValueRef *out_lod_positive)

{
   LLVMBuilderRef builder = bld->gallivm->builder;
   struct lp_sampler_dynamic_state *dynamic_state = bld->dynamic_state;
   struct lp_build_context *lodf_bld = &bld->lodf_bld;
   LLVMValueRef lod;

   /* Default outputs: lod 0 with zero fractional part, "not minified". */
   *out_lod_ipart = bld->lodi_bld.zero;
   *out_lod_positive = bld->lodi_bld.zero;
   *out_lod_fpart = lodf_bld->zero;

   /*
    * For determining min/mag, we follow GL 4.1 spec, 3.9.12 Texture Magnification:
    * "Implementations may either unconditionally assume c = 0 for the minification
    * vs. magnification switch-over point, or may choose to make c depend on the
    * combination of minification and magnification modes as follows: if the
    * magnification filter is given by LINEAR and the minification filter is given
    * by NEAREST_MIPMAP_NEAREST or NEAREST_MIPMAP_LINEAR, then c = 0.5. This is
    * done to ensure that a minified texture does not appear "sharper" than a
    * magnified texture. Otherwise c = 0."
    * And 3.9.11 Texture Minification:
    * "If lod is less than or equal to the constant c (see section 3.9.12) the
    * texture is said to be magnified; if it is greater, the texture is minified."
    * So, using 0 as switchover point always, and using magnification for lod == 0.
    * Note that the always c = 0 behavior is new (first appearing in GL 3.1 spec),
    * old GL versions required 0.5 for the modes listed above.
    * I have no clue about the (undocumented) wishes of d3d9/d3d10 here!
    */

   if (bld->static_sampler_state->min_max_lod_equal && !is_lodq) {
      /* User is forcing sampling from a particular mipmap level.
       * This is hit during mipmap generation.
       */
      LLVMValueRef min_lod =
         dynamic_state->min_lod(dynamic_state, bld->gallivm,
                                bld->context_ptr, sampler_unit);

      lod = lp_build_broadcast_scalar(lodf_bld, min_lod);
   }
   else {
      if (explicit_lod) {
         /* Shader-supplied lod: repack from coord-vector layout to the
          * (possibly narrower) per-quad lod layout if they differ. */
         if (bld->num_lods != bld->coord_type.length)
            lod = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type,
                                            lodf_bld->type, explicit_lod, 0);
         else
            lod = explicit_lod;
      }
      else {
         LLVMValueRef rho;
         /* lp_build_rho yields rho^2 (not rho) when no_rho_approx is set for
          * dims > 1, and cube coord mapping always hands us a squared rho. */
         boolean rho_squared = (bld->no_rho_approx &&
                                (bld->dims > 1)) || cube_rho;

         rho = lp_build_rho(bld, texture_unit, s, t, r, cube_rho, derivs);

         /*
          * Compute lod = log2(rho)
          */

         if (!lod_bias && !is_lodq &&
             !bld->static_sampler_state->lod_bias_non_zero &&
             !bld->static_sampler_state->apply_max_lod &&
             !bld->static_sampler_state->apply_min_lod) {
            /*
             * Special case when there are no post-log2 adjustments, which
             * saves instructions by keeping the integer and fractional lod
             * computations separate from the start.
             */

            if (mip_filter == PIPE_TEX_MIPFILTER_NONE ||
                mip_filter == PIPE_TEX_MIPFILTER_NEAREST) {
               /*
                * Don't actually need both values all the time, lod_ipart is
                * needed for nearest mipfilter, lod_positive if min != mag.
                */
               if (rho_squared) {
                  /* ilog2(sqrt(rho^2)) avoids an explicit sqrt. */
                  *out_lod_ipart = lp_build_ilog2_sqrt(lodf_bld, rho);
               }
               else {
                  *out_lod_ipart = lp_build_ilog2(lodf_bld, rho);
               }
               /* rho > 1 <=> log2(rho) > 0, i.e. minification.
                * (Comparing rho^2 > 1 is equivalent since rho >= 0.) */
               *out_lod_positive = lp_build_cmp(lodf_bld, PIPE_FUNC_GREATER,
                                                rho, lodf_bld->one);
               return;
            }
            if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR &&
                !bld->no_brilinear && !rho_squared) {
               /*
                * This can't work if rho is squared. Not sure if it could be
                * fixed while keeping it worthwhile, could also do sqrt here
                * but brilinear and no_rho_opt seems like a combination not
                * making much sense anyway so just use ordinary path below.
                */
               lp_build_brilinear_rho(lodf_bld, rho, BRILINEAR_FACTOR,
                                      out_lod_ipart, out_lod_fpart);
               *out_lod_positive = lp_build_cmp(lodf_bld, PIPE_FUNC_GREATER,
                                                rho, lodf_bld->one);
               return;
            }
         }

         /* General path: full float lod. The exact log2 is kept around
          * (disabled) for debugging; the fast approximation is used. */
         if (0) {
            lod = lp_build_log2(lodf_bld, rho);
         }
         else {
            lod = lp_build_fast_log2(lodf_bld, rho);
         }
         if (rho_squared) {
            /* rho holds rho^2 here: log2(rho) == 0.5 * log2(rho^2) */
            lod = lp_build_mul(lodf_bld, lod,
                               lp_build_const_vec(bld->gallivm, lodf_bld->type, 0.5F));
         }

         /* add shader lod bias */
         if (lod_bias) {
            if (bld->num_lods != bld->coord_type.length)
               lod_bias = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type,
                                                    lodf_bld->type, lod_bias, 0);
            lod = LLVMBuildFAdd(builder, lod, lod_bias, "shader_lod_bias");
         }
      }

      /* add sampler lod bias */
      if (bld->static_sampler_state->lod_bias_non_zero) {
         LLVMValueRef sampler_lod_bias =
            dynamic_state->lod_bias(dynamic_state, bld->gallivm,
                                    bld->context_ptr, sampler_unit);
         sampler_lod_bias = lp_build_broadcast_scalar(lodf_bld,
                                                      sampler_lod_bias);
         lod = LLVMBuildFAdd(builder, lod, sampler_lod_bias, "sampler_lod_bias");
      }

      /* For lod queries, *out_lod gets the biased but unclamped lod. */
      if (is_lodq) {
         *out_lod = lod;
      }

      /* clamp lod */
      if (bld->static_sampler_state->apply_max_lod) {
         LLVMValueRef max_lod =
            dynamic_state->max_lod(dynamic_state, bld->gallivm,
                                   bld->context_ptr, sampler_unit);
         max_lod = lp_build_broadcast_scalar(lodf_bld, max_lod);

         lod = lp_build_min(lodf_bld, lod, max_lod);
      }
      if (bld->static_sampler_state->apply_min_lod) {
         LLVMValueRef min_lod =
            dynamic_state->min_lod(dynamic_state, bld->gallivm,
                                   bld->context_ptr, sampler_unit);
         min_lod = lp_build_broadcast_scalar(lodf_bld, min_lod);

         lod = lp_build_max(lodf_bld, lod, min_lod);
      }

      /* Lod queries want the clamped float lod; skip ipart/fpart split. */
      if (is_lodq) {
         *out_lod_fpart = lod;
         return;
      }
   }

   /* lod > 0 means minification (c = 0 switch-over point, see above). */
   *out_lod_positive = lp_build_cmp(lodf_bld, PIPE_FUNC_GREATER,
                                    lod, lodf_bld->zero);

   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
      /* Split lod into integer level and fractional interpolation weight. */
      if (!bld->no_brilinear) {
         lp_build_brilinear_lod(lodf_bld, lod, BRILINEAR_FACTOR,
                                out_lod_ipart, out_lod_fpart);
      }
      else {
         lp_build_ifloor_fract(lodf_bld, lod, out_lod_ipart, out_lod_fpart);
      }

      lp_build_name(*out_lod_fpart, "lod_fpart");
   }
   else {
      /* Nearest/no mipfilter: round to the closest level. */
      *out_lod_ipart = lp_build_iround(lodf_bld, lod);
   }

   lp_build_name(*out_lod_ipart, "lod_ipart");

   return;
}
    893 
    894 
    895 /**
    896  * For PIPE_TEX_MIPFILTER_NEAREST, convert int part of lod
    897  * to actual mip level.
    898  * Note: this is all scalar per quad code.
    899  * \param lod_ipart  int texture level of detail
    900  * \param level_out  returns integer
    901  * \param out_of_bounds returns per coord out_of_bounds mask if provided
    902  */
    903 void
    904 lp_build_nearest_mip_level(struct lp_build_sample_context *bld,
    905                            unsigned texture_unit,
    906                            LLVMValueRef lod_ipart,
    907                            LLVMValueRef *level_out,
    908                            LLVMValueRef *out_of_bounds)
    909 {
    910    struct lp_build_context *leveli_bld = &bld->leveli_bld;
    911    struct lp_sampler_dynamic_state *dynamic_state = bld->dynamic_state;
    912    LLVMValueRef first_level, last_level, level;
    913 
    914    first_level = dynamic_state->first_level(dynamic_state, bld->gallivm,
    915                                             bld->context_ptr, texture_unit);
    916    last_level = dynamic_state->last_level(dynamic_state, bld->gallivm,
    917                                           bld->context_ptr, texture_unit);
    918    first_level = lp_build_broadcast_scalar(leveli_bld, first_level);
    919    last_level = lp_build_broadcast_scalar(leveli_bld, last_level);
    920 
    921    level = lp_build_add(leveli_bld, lod_ipart, first_level);
    922 
    923    if (out_of_bounds) {
    924       LLVMValueRef out, out1;
    925       out = lp_build_cmp(leveli_bld, PIPE_FUNC_LESS, level, first_level);
    926       out1 = lp_build_cmp(leveli_bld, PIPE_FUNC_GREATER, level, last_level);
    927       out = lp_build_or(leveli_bld, out, out1);
    928       if (bld->num_mips == bld->coord_bld.type.length) {
    929          *out_of_bounds = out;
    930       }
    931       else if (bld->num_mips == 1) {
    932          *out_of_bounds = lp_build_broadcast_scalar(&bld->int_coord_bld, out);
    933       }
    934       else {
    935          assert(bld->num_mips == bld->coord_bld.type.length / 4);
    936          *out_of_bounds = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
    937                                                                 leveli_bld->type,
    938                                                                 bld->int_coord_bld.type,
    939                                                                 out);
    940       }
    941       level = lp_build_andnot(&bld->int_coord_bld, level, *out_of_bounds);
    942       *level_out = level;
    943    }
    944    else {
    945       /* clamp level to legal range of levels */
    946       *level_out = lp_build_clamp(leveli_bld, level, first_level, last_level);
    947 
    948    }
    949 }
    950 
    951 
    952 /**
    953  * For PIPE_TEX_MIPFILTER_LINEAR, convert per-quad (or per element) int LOD(s)
    954  * to two (per-quad) (adjacent) mipmap level indexes, and fix up float lod
    955  * part accordingly.
    956  * Later, we'll sample from those two mipmap levels and interpolate between them.
    957  */
void
lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
                           unsigned texture_unit,
                           LLVMValueRef lod_ipart,
                           LLVMValueRef *lod_fpart_inout,
                           LLVMValueRef *level0_out,
                           LLVMValueRef *level1_out)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   struct lp_sampler_dynamic_state *dynamic_state = bld->dynamic_state;
   struct lp_build_context *leveli_bld = &bld->leveli_bld;
   struct lp_build_context *levelf_bld = &bld->levelf_bld;
   LLVMValueRef first_level, last_level;
   LLVMValueRef clamp_min;
   LLVMValueRef clamp_max;

   /* one lod per mip lookup is assumed throughout the clamping below */
   assert(bld->num_lods == bld->num_mips);

   /* fetch the texture's valid level range, broadcast to level vector type */
   first_level = dynamic_state->first_level(dynamic_state, bld->gallivm,
                                            bld->context_ptr, texture_unit);
   last_level = dynamic_state->last_level(dynamic_state, bld->gallivm,
                                          bld->context_ptr, texture_unit);
   first_level = lp_build_broadcast_scalar(leveli_bld, first_level);
   last_level = lp_build_broadcast_scalar(leveli_bld, last_level);

   /* lod_ipart is relative to first_level; level1 is the next finer-to-coarser
    * neighbour we will interpolate towards. */
   *level0_out = lp_build_add(leveli_bld, lod_ipart, first_level);
   *level1_out = lp_build_add(leveli_bld, *level0_out, leveli_bld->one);

   /*
    * Clamp both *level0_out and *level1_out to [first_level, last_level], with
    * the minimum number of comparisons, and zeroing lod_fpart in the extreme
    * ends in the process.
    */

   /* *level0_out < first_level */
   clamp_min = LLVMBuildICmp(builder, LLVMIntSLT,
                             *level0_out, first_level,
                             "clamp_lod_to_first");

   *level0_out = LLVMBuildSelect(builder, clamp_min,
                                 first_level, *level0_out, "");

   /* level1 is also forced to first_level (not first_level + 1), so both
    * levels are identical ... */
   *level1_out = LLVMBuildSelect(builder, clamp_min,
                                 first_level, *level1_out, "");

   /* ... and the interpolation weight is zeroed to match. */
   *lod_fpart_inout = LLVMBuildSelect(builder, clamp_min,
                                      levelf_bld->zero, *lod_fpart_inout, "");

   /* *level0_out >= last_level */
   clamp_max = LLVMBuildICmp(builder, LLVMIntSGE,
                             *level0_out, last_level,
                             "clamp_lod_to_last");

   *level0_out = LLVMBuildSelect(builder, clamp_max,
                                 last_level, *level0_out, "");

   /* same trick at the top end: both levels collapse onto last_level ... */
   *level1_out = LLVMBuildSelect(builder, clamp_max,
                                 last_level, *level1_out, "");

   /* ... with a zero weight, so the blend is a no-op. */
   *lod_fpart_inout = LLVMBuildSelect(builder, clamp_max,
                                      levelf_bld->zero, *lod_fpart_inout, "");

   lp_build_name(*level0_out, "texture%u_miplevel0", texture_unit);
   lp_build_name(*level1_out, "texture%u_miplevel1", texture_unit);
   lp_build_name(*lod_fpart_inout, "texture%u_mipweight", texture_unit);
}
   1024 
   1025 
   1026 /**
   1027  * Return pointer to a single mipmap level.
   1028  * \param level  integer mipmap level
   1029  */
   1030 LLVMValueRef
   1031 lp_build_get_mipmap_level(struct lp_build_sample_context *bld,
   1032                           LLVMValueRef level)
   1033 {
   1034    LLVMBuilderRef builder = bld->gallivm->builder;
   1035    LLVMValueRef indexes[2], data_ptr, mip_offset;
   1036 
   1037    indexes[0] = lp_build_const_int32(bld->gallivm, 0);
   1038    indexes[1] = level;
   1039    mip_offset = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, "");
   1040    mip_offset = LLVMBuildLoad(builder, mip_offset, "");
   1041    data_ptr = LLVMBuildGEP(builder, bld->base_ptr, &mip_offset, 1, "");
   1042    return data_ptr;
   1043 }
   1044 
   1045 /**
   1046  * Return (per-pixel) offsets to mip levels.
   1047  * \param level  integer mipmap level
   1048  */
   1049 LLVMValueRef
   1050 lp_build_get_mip_offsets(struct lp_build_sample_context *bld,
   1051                          LLVMValueRef level)
   1052 {
   1053    LLVMBuilderRef builder = bld->gallivm->builder;
   1054    LLVMValueRef indexes[2], offsets, offset1;
   1055 
   1056    indexes[0] = lp_build_const_int32(bld->gallivm, 0);
   1057    if (bld->num_mips == 1) {
   1058       indexes[1] = level;
   1059       offset1 = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, "");
   1060       offset1 = LLVMBuildLoad(builder, offset1, "");
   1061       offsets = lp_build_broadcast_scalar(&bld->int_coord_bld, offset1);
   1062    }
   1063    else if (bld->num_mips == bld->coord_bld.type.length / 4) {
   1064       unsigned i;
   1065 
   1066       offsets = bld->int_coord_bld.undef;
   1067       for (i = 0; i < bld->num_mips; i++) {
   1068          LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
   1069          LLVMValueRef indexo = lp_build_const_int32(bld->gallivm, 4 * i);
   1070          indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
   1071          offset1 = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, "");
   1072          offset1 = LLVMBuildLoad(builder, offset1, "");
   1073          offsets = LLVMBuildInsertElement(builder, offsets, offset1, indexo, "");
   1074       }
   1075       offsets = lp_build_swizzle_scalar_aos(&bld->int_coord_bld, offsets, 0, 4);
   1076    }
   1077    else {
   1078       unsigned i;
   1079 
   1080       assert (bld->num_mips == bld->coord_bld.type.length);
   1081 
   1082       offsets = bld->int_coord_bld.undef;
   1083       for (i = 0; i < bld->num_mips; i++) {
   1084          LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
   1085          indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
   1086          offset1 = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, "");
   1087          offset1 = LLVMBuildLoad(builder, offset1, "");
   1088          offsets = LLVMBuildInsertElement(builder, offsets, offset1, indexi, "");
   1089       }
   1090    }
   1091    return offsets;
   1092 }
   1093 
   1094 
   1095 /**
   1096  * Codegen equivalent for u_minify().
   1097  * @param lod_scalar  if lod is a (broadcasted) scalar
   1098  * Return max(1, base_size >> level);
   1099  */
LLVMValueRef
lp_build_minify(struct lp_build_context *bld,
                LLVMValueRef base_size,
                LLVMValueRef level,
                boolean lod_scalar)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   assert(lp_check_value(bld->type, base_size));
   assert(lp_check_value(bld->type, level));

   /* Pointer comparison against the context's zero constant; this only
    * catches the case where the caller literally passed bld->zero (an
    * intentional fast path), not every all-zero vector. */
   if (level == bld->zero) {
      /* if we're using mipmap level zero, no minification is needed */
      return base_size;
   }
   else {
      LLVMValueRef size;
      assert(bld->type.sign);
      /* A plain vector shift is fine when the shift count is (broadcast)
       * scalar, or when per-element variable shifts are cheap (AVX2), or
       * when we're not on an SSE-era x86 at all. */
      if (lod_scalar ||
         (util_cpu_caps.has_avx2 || !util_cpu_caps.has_sse)) {
         size = LLVMBuildLShr(builder, base_size, level, "minify");
         size = lp_build_max(bld, size, bld->one);
      }
      else {
         /*
          * emulate shift with float mul, since intel "forgot" shifts with
          * per-element shift count until avx2, which results in terrible
          * scalar extraction (both count and value), scalar shift,
          * vector reinsertion. Should not be an issue on any non-x86 cpu
          * with a vector instruction set.
          * On cpus with AMD's XOP this should also be unnecessary but I'm
          * not sure if llvm would emit this with current flags.
          */
         LLVMValueRef const127, const23, lf;
         struct lp_type ftype;
         struct lp_build_context fbld;
         /* matching float vector type for the bitcast below */
         ftype = lp_type_float_vec(32, bld->type.length * bld->type.width);
         lp_build_context_init(&fbld, bld->gallivm, ftype);
         /* 127 = IEEE-754 single-precision exponent bias,
          * 23 = mantissa width (exponent field position) */
         const127 = lp_build_const_int_vec(bld->gallivm, bld->type, 127);
         const23 = lp_build_const_int_vec(bld->gallivm, bld->type, 23);

         /* calculate 2^(-level) float: place (127 - level) in the exponent
          * field and reinterpret the bits as a float */
         lf = lp_build_sub(bld, const127, level);
         lf = lp_build_shl(bld, lf, const23);
         lf = LLVMBuildBitCast(builder, lf, fbld.vec_type, "");

         /* finish shift operation by doing float mul */
         base_size = lp_build_int_to_float(&fbld, base_size);
         size = lp_build_mul(&fbld, base_size, lf);
         /*
          * do the max also with floats because
          * a) non-emulated int max requires sse41
          *    (this is actually a lie as we could cast to 16bit values
          *    as 16bit is sufficient and 16bit int max is sse2)
          * b) with avx we can do int max 4-wide but float max 8-wide
          */
         size = lp_build_max(&fbld, size, fbld.one);
         /* back to integers for the caller */
         size = lp_build_itrunc(&fbld, size);
      }
      return size;
   }
}
   1161 
   1162 
   1163 /**
   1164  * Dereference stride_array[mipmap_level] array to get a stride.
   1165  * Return stride as a vector.
   1166  */
   1167 static LLVMValueRef
   1168 lp_build_get_level_stride_vec(struct lp_build_sample_context *bld,
   1169                               LLVMValueRef stride_array, LLVMValueRef level)
   1170 {
   1171    LLVMBuilderRef builder = bld->gallivm->builder;
   1172    LLVMValueRef indexes[2], stride, stride1;
   1173    indexes[0] = lp_build_const_int32(bld->gallivm, 0);
   1174    if (bld->num_mips == 1) {
   1175       indexes[1] = level;
   1176       stride1 = LLVMBuildGEP(builder, stride_array, indexes, 2, "");
   1177       stride1 = LLVMBuildLoad(builder, stride1, "");
   1178       stride = lp_build_broadcast_scalar(&bld->int_coord_bld, stride1);
   1179    }
   1180    else if (bld->num_mips == bld->coord_bld.type.length / 4) {
   1181       LLVMValueRef stride1;
   1182       unsigned i;
   1183 
   1184       stride = bld->int_coord_bld.undef;
   1185       for (i = 0; i < bld->num_mips; i++) {
   1186          LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
   1187          LLVMValueRef indexo = lp_build_const_int32(bld->gallivm, 4 * i);
   1188          indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
   1189          stride1 = LLVMBuildGEP(builder, stride_array, indexes, 2, "");
   1190          stride1 = LLVMBuildLoad(builder, stride1, "");
   1191          stride = LLVMBuildInsertElement(builder, stride, stride1, indexo, "");
   1192       }
   1193       stride = lp_build_swizzle_scalar_aos(&bld->int_coord_bld, stride, 0, 4);
   1194    }
   1195    else {
   1196       LLVMValueRef stride1;
   1197       unsigned i;
   1198 
   1199       assert (bld->num_mips == bld->coord_bld.type.length);
   1200 
   1201       stride = bld->int_coord_bld.undef;
   1202       for (i = 0; i < bld->coord_bld.type.length; i++) {
   1203          LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
   1204          indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
   1205          stride1 = LLVMBuildGEP(builder, stride_array, indexes, 2, "");
   1206          stride1 = LLVMBuildLoad(builder, stride1, "");
   1207          stride = LLVMBuildInsertElement(builder, stride, stride1, indexi, "");
   1208       }
   1209    }
   1210    return stride;
   1211 }
   1212 
   1213 
   1214 /**
   1215  * When sampling a mipmap, we need to compute the width, height, depth
   1216  * of the source levels from the level indexes.  This helper function
   1217  * does that.
   1218  */
void
lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
                            LLVMValueRef ilevel,
                            LLVMValueRef *out_size,
                            LLVMValueRef *row_stride_vec,
                            LLVMValueRef *img_stride_vec)
{
   const unsigned dims = bld->dims;
   LLVMValueRef ilevel_vec;

   /*
    * Compute width, height, depth at mipmap level 'ilevel'
    */
   if (bld->num_mips == 1) {
      /* Single level for all pixels: minify the base size once with a
       * broadcast (scalar) level. */
      ilevel_vec = lp_build_broadcast_scalar(&bld->int_size_bld, ilevel);
      *out_size = lp_build_minify(&bld->int_size_bld, bld->int_size, ilevel_vec, TRUE);
   }
   else {
      LLVMValueRef int_size_vec;
      LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
      unsigned num_quads = bld->coord_bld.type.length / 4;
      unsigned i;

      if (bld->num_mips == num_quads) {
         /*
          * XXX: this should be #ifndef SANE_INSTRUCTION_SET.
          * intel "forgot" the variable shift count instruction until avx2.
          * A harmless 8x32 shift gets translated into 32 instructions
          * (16 extracts, 8 scalar shifts, 8 inserts), llvm is apparently
          * unable to recognize if there are really just 2 different shift
          * count values. So do the shift 4-wide before expansion.
          */
         struct lp_build_context bld4;
         struct lp_type type4;

         /* 4-wide variant of the int coord type, for per-quad minify */
         type4 = bld->int_coord_bld.type;
         type4.length = 4;

         lp_build_context_init(&bld4, bld->gallivm, type4);

         if (bld->dims == 1) {
            /* 1D: base size is scalar, splat it to the 4-wide type */
            assert(bld->int_size_in_bld.type.length == 1);
            int_size_vec = lp_build_broadcast_scalar(&bld4,
                                                     bld->int_size);
         }
         else {
            /* 2D/3D: base size already is a 4-wide [w, h, d, _] vector */
            assert(bld->int_size_in_bld.type.length == 4);
            int_size_vec = bld->int_size;
         }

         /* Minify once per quad with that quad's (broadcast) level. */
         for (i = 0; i < num_quads; i++) {
            LLVMValueRef ileveli;
            LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);

            ileveli = lp_build_extract_broadcast(bld->gallivm,
                                                 bld->leveli_bld.type,
                                                 bld4.type,
                                                 ilevel,
                                                 indexi);
            tmp[i] = lp_build_minify(&bld4, int_size_vec, ileveli, TRUE);
         }
         /*
          * out_size is [w0, h0, d0, _, w1, h1, d1, _, ...] vector for dims > 1,
          * [w0, w0, w0, w0, w1, w1, w1, w1, ...] otherwise.
          */
         *out_size = lp_build_concat(bld->gallivm,
                                     tmp,
                                     bld4.type,
                                     num_quads);
      }
      else {
        /* FIXME: this is terrible and results in _huge_ vector
         * (for the dims > 1 case).
         * Should refactor this (together with extract_image_sizes) and do
         * something more useful. Could for instance if we have width,height
         * with 4-wide vector pack all elements into a 8xi16 vector
         * (on which we can still do useful math) instead of using a 16xi32
         * vector.
         * For dims == 1 this will create [w0, w1, w2, w3, ...] vector.
         * For dims > 1 this will create [w0, h0, d0, _, w1, h1, d1, _, ...] vector.
         */
         assert(bld->num_mips == bld->coord_bld.type.length);
         if (bld->dims == 1) {
            /* 1D per-element: minify the splatted width with the per-element
             * level vector directly (lod_scalar == FALSE). */
            assert(bld->int_size_in_bld.type.length == 1);
            int_size_vec = lp_build_broadcast_scalar(&bld->int_coord_bld,
                                                     bld->int_size);
            *out_size = lp_build_minify(&bld->int_coord_bld, int_size_vec, ilevel, FALSE);
         }
         else {
            /* 2D/3D per-element: minify the size vector once per element,
             * then concatenate all the per-element results. */
            LLVMValueRef ilevel1;
            for (i = 0; i < bld->num_mips; i++) {
               LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
               ilevel1 = lp_build_extract_broadcast(bld->gallivm, bld->int_coord_type,
                                                    bld->int_size_in_bld.type, ilevel, indexi);
               tmp[i] = bld->int_size;
               tmp[i] = lp_build_minify(&bld->int_size_in_bld, tmp[i], ilevel1, TRUE);
            }
            *out_size = lp_build_concat(bld->gallivm, tmp,
                                        bld->int_size_in_bld.type,
                                        bld->num_mips);
         }
      }
   }

   /* Row stride is only meaningful for 2D and up. */
   if (dims >= 2) {
      *row_stride_vec = lp_build_get_level_stride_vec(bld,
                                                      bld->row_stride_array,
                                                      ilevel);
   }
   /* Image stride for 3D textures, and for array/layered targets. */
   if (dims == 3 || has_layer_coord(bld->static_texture_state->target)) {
      *img_stride_vec = lp_build_get_level_stride_vec(bld,
                                                      bld->img_stride_array,
                                                      ilevel);
   }
}
   1334 
   1335 
   1336 /**
   1337  * Extract and broadcast texture size.
   1338  *
 * @param size_bld    build context for the texture size vector (its type is
 *                    either bld->int_size_type or bld->float_size_type)
 * @param coord_type  type of the coordinate vector (either
 *                    bld->int_coord_type or bld->coord_type)
   1343  * @param size        vector with the texture size (width, height, depth)
   1344  */
   1345 void
   1346 lp_build_extract_image_sizes(struct lp_build_sample_context *bld,
   1347                              struct lp_build_context *size_bld,
   1348                              struct lp_type coord_type,
   1349                              LLVMValueRef size,
   1350                              LLVMValueRef *out_width,
   1351                              LLVMValueRef *out_height,
   1352                              LLVMValueRef *out_depth)
   1353 {
   1354    const unsigned dims = bld->dims;
   1355    LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   1356    struct lp_type size_type = size_bld->type;
   1357 
   1358    if (bld->num_mips == 1) {
   1359       *out_width = lp_build_extract_broadcast(bld->gallivm,
   1360                                               size_type,
   1361                                               coord_type,
   1362                                               size,
   1363                                               LLVMConstInt(i32t, 0, 0));
   1364       if (dims >= 2) {
   1365          *out_height = lp_build_extract_broadcast(bld->gallivm,
   1366                                                   size_type,
   1367                                                   coord_type,
   1368                                                   size,
   1369                                                   LLVMConstInt(i32t, 1, 0));
   1370          if (dims == 3) {
   1371             *out_depth = lp_build_extract_broadcast(bld->gallivm,
   1372                                                     size_type,
   1373                                                     coord_type,
   1374                                                     size,
   1375                                                     LLVMConstInt(i32t, 2, 0));
   1376          }
   1377       }
   1378    }
   1379    else {
   1380       unsigned num_quads = bld->coord_bld.type.length / 4;
   1381 
   1382       if (dims == 1) {
   1383          *out_width = size;
   1384       }
   1385       else if (bld->num_mips == num_quads) {
   1386          *out_width = lp_build_swizzle_scalar_aos(size_bld, size, 0, 4);
   1387          if (dims >= 2) {
   1388             *out_height = lp_build_swizzle_scalar_aos(size_bld, size, 1, 4);
   1389             if (dims == 3) {
   1390                *out_depth = lp_build_swizzle_scalar_aos(size_bld, size, 2, 4);
   1391             }
   1392          }
   1393       }
   1394       else {
   1395          assert(bld->num_mips == bld->coord_type.length);
   1396          *out_width = lp_build_pack_aos_scalars(bld->gallivm, size_type,
   1397                                                 coord_type, size, 0);
   1398          if (dims >= 2) {
   1399             *out_height = lp_build_pack_aos_scalars(bld->gallivm, size_type,
   1400                                                     coord_type, size, 1);
   1401             if (dims == 3) {
   1402                *out_depth = lp_build_pack_aos_scalars(bld->gallivm, size_type,
   1403                                                       coord_type, size, 2);
   1404             }
   1405          }
   1406       }
   1407    }
   1408 }
   1409 
   1410 
   1411 /**
   1412  * Unnormalize coords.
   1413  *
 * @param flt_size  vector with the floating-point texture size (width, height, depth)
   1415  */
   1416 void
   1417 lp_build_unnormalized_coords(struct lp_build_sample_context *bld,
   1418                              LLVMValueRef flt_size,
   1419                              LLVMValueRef *s,
   1420                              LLVMValueRef *t,
   1421                              LLVMValueRef *r)
   1422 {
   1423    const unsigned dims = bld->dims;
   1424    LLVMValueRef width;
   1425    LLVMValueRef height = NULL;
   1426    LLVMValueRef depth = NULL;
   1427 
   1428    lp_build_extract_image_sizes(bld,
   1429                                 &bld->float_size_bld,
   1430                                 bld->coord_type,
   1431                                 flt_size,
   1432                                 &width,
   1433                                 &height,
   1434                                 &depth);
   1435 
   1436    /* s = s * width, t = t * height */
   1437    *s = lp_build_mul(&bld->coord_bld, *s, width);
   1438    if (dims >= 2) {
   1439       *t = lp_build_mul(&bld->coord_bld, *t, height);
   1440       if (dims >= 3) {
   1441          *r = lp_build_mul(&bld->coord_bld, *r, depth);
   1442       }
   1443    }
   1444 }
   1445 
   1446 /**
   1447  * Generate new coords and faces for cubemap texels falling off the face.
   1448  *
   1449  * @param face   face (center) of the pixel
   1450  * @param x0     lower x coord
   1451  * @param x1     higher x coord (must be x0 + 1)
   1452  * @param y0     lower y coord
 * @param y1     higher y coord (must be y0 + 1)
   1454  * @param max_coord     texture cube (level) size - 1
   1455  * @param next_faces    new face values when falling off
   1456  * @param next_xcoords  new x coord values when falling off
   1457  * @param next_ycoords  new y coord values when falling off
   1458  *
   1459  * The arrays hold the new values when under/overflow of
   1460  * lower x, higher x, lower y, higher y coord would occur (in this order).
   1461  * next_xcoords/next_ycoords have two entries each (for both new lower and
   1462  * higher coord).
   1463  */
void
lp_build_cube_new_coords(struct lp_build_context *ivec_bld,
                        LLVMValueRef face,
                        LLVMValueRef x0,
                        LLVMValueRef x1,
                        LLVMValueRef y0,
                        LLVMValueRef y1,
                        LLVMValueRef max_coord,
                        LLVMValueRef next_faces[4],
                        LLVMValueRef next_xcoords[4][2],
                        LLVMValueRef next_ycoords[4][2])
{
   /*
    * Lookup tables aren't nice for simd code hence try some logic here.
    * (Note that while it would not be necessary to do per-sample (4) lookups
    * when using a LUT as it's impossible that texels fall off of positive
    * and negative edges simultaneously, it would however be necessary to
    * do 2 lookups for corner handling as in this case texels both fall off
    * of x and y axes.)
    */
   /*
    * Next faces (for face 012345):
    * x < 0.0  : 451110
    * x >= 1.0 : 540001
    * y < 0.0  : 225422
    * y >= 1.0 : 334533
    * Hence nfx+ (and nfy+) == nfx- (nfy-) xor 1
    * nfx-: face > 1 ? (face == 5 ? 0 : 1) : (4 + face & 1)
    * nfy+: face & ~4 > 1 ? face + 2 : 3;
    * This could also use pshufb instead, but would need (manually coded)
    * ssse3 intrinsic (llvm won't do non-constant shuffles).
    */
   struct gallivm_state *gallivm = ivec_bld->gallivm;
   LLVMValueRef sel, sel_f2345, sel_f23, sel_f2, tmpsel, tmp;
   LLVMValueRef faceand1, sel_fand1, maxmx0, maxmx1, maxmy0, maxmy1;
   LLVMValueRef c2 = lp_build_const_int_vec(gallivm, ivec_bld->type, 2);
   LLVMValueRef c3 = lp_build_const_int_vec(gallivm, ivec_bld->type, 3);
   LLVMValueRef c4 = lp_build_const_int_vec(gallivm, ivec_bld->type, 4);
   LLVMValueRef c5 = lp_build_const_int_vec(gallivm, ivec_bld->type, 5);

   /* new faces for under/overflow in x direction (see table above) */
   sel = lp_build_cmp(ivec_bld, PIPE_FUNC_EQUAL, face, c5);
   tmpsel = lp_build_select(ivec_bld, sel, ivec_bld->zero, ivec_bld->one);
   sel_f2345 = lp_build_cmp(ivec_bld, PIPE_FUNC_GREATER, face, ivec_bld->one);
   faceand1 = lp_build_and(ivec_bld, face, ivec_bld->one);
   tmp = lp_build_add(ivec_bld, faceand1, c4);
   next_faces[0] = lp_build_select(ivec_bld, sel_f2345, tmpsel, tmp);
   next_faces[1] = lp_build_xor(ivec_bld, next_faces[0], ivec_bld->one);

   /* new faces for under/overflow in y direction */
   tmp = lp_build_andnot(ivec_bld, face, c4);
   sel_f23 = lp_build_cmp(ivec_bld, PIPE_FUNC_GREATER, tmp, ivec_bld->one);
   tmp = lp_build_add(ivec_bld, face, c2);
   next_faces[3] = lp_build_select(ivec_bld, sel_f23, tmp, c3);
   next_faces[2] = lp_build_xor(ivec_bld, next_faces[3], ivec_bld->one);

   /*
    * new xcoords (for face 012345):
    * x < 0.0  : max   max   t     max-t max  max
    * x >= 1.0 : 0     0     max-t t     0    0
    * y < 0.0  : max   0     max-s s     s    max-s
    * y >= 1.0 : max   0     s     max-s s    max-s
    *
    * ncx[1] = face & ~4 > 1 ? (face == 2 ? max-t : t) : 0
    * ncx[0] = max - ncx[1]
    * ncx[3] = face > 1 ? (face & 1 ? max-s : s) : (face & 1) ? 0 : max
    * ncx[2] = face & ~4 > 1 ? max - ncx[3] : ncx[3]
    */
   sel_f2 = lp_build_cmp(ivec_bld, PIPE_FUNC_EQUAL, face, c2);
   maxmy0 = lp_build_sub(ivec_bld, max_coord, y0);
   tmp = lp_build_select(ivec_bld, sel_f2, maxmy0, y0);
   next_xcoords[1][0] = lp_build_select(ivec_bld, sel_f23, tmp, ivec_bld->zero);
   next_xcoords[0][0] = lp_build_sub(ivec_bld, max_coord, next_xcoords[1][0]);
   maxmy1 = lp_build_sub(ivec_bld, max_coord, y1);
   tmp = lp_build_select(ivec_bld, sel_f2, maxmy1, y1);
   next_xcoords[1][1] = lp_build_select(ivec_bld, sel_f23, tmp, ivec_bld->zero);
   next_xcoords[0][1] = lp_build_sub(ivec_bld, max_coord, next_xcoords[1][1]);

   sel_fand1 = lp_build_cmp(ivec_bld, PIPE_FUNC_EQUAL, faceand1, ivec_bld->one);

   tmpsel = lp_build_select(ivec_bld, sel_fand1, ivec_bld->zero, max_coord);
   maxmx0 = lp_build_sub(ivec_bld, max_coord, x0);
   tmp = lp_build_select(ivec_bld, sel_fand1, maxmx0, x0);
   next_xcoords[3][0] = lp_build_select(ivec_bld, sel_f2345, tmp, tmpsel);
   tmp = lp_build_sub(ivec_bld, max_coord, next_xcoords[3][0]);
   next_xcoords[2][0] = lp_build_select(ivec_bld, sel_f23, tmp, next_xcoords[3][0]);
   maxmx1 = lp_build_sub(ivec_bld, max_coord, x1);
   tmp = lp_build_select(ivec_bld, sel_fand1, maxmx1, x1);
   next_xcoords[3][1] = lp_build_select(ivec_bld, sel_f2345, tmp, tmpsel);
   tmp = lp_build_sub(ivec_bld, max_coord, next_xcoords[3][1]);
   next_xcoords[2][1] = lp_build_select(ivec_bld, sel_f23, tmp, next_xcoords[3][1]);

   /*
    * new ycoords (for face 012345):
    * x < 0.0  : t     t     0     max   t    t
    * x >= 1.0 : t     t     0     max   t    t
    * y < 0.0  : max-s s     0     max   max  0
    * y >= 1.0 : s     max-s 0     max   0    max
    *
    * ncy[0] = face & ~4 > 1 ? (face == 2 ? 0 : max) : t
    * ncy[1] = ncy[0]
    * ncy[3] = face > 1 ? (face & 1 ? max : 0) : (face & 1 ? max-s : s)
    * ncy[2] = face & ~4 > 1 ? ncy[3] : max - ncy[3]
    */
   tmp = lp_build_select(ivec_bld, sel_f2, ivec_bld->zero, max_coord);
   next_ycoords[0][0] = lp_build_select(ivec_bld, sel_f23, tmp, y0);
   next_ycoords[1][0] = next_ycoords[0][0];
   next_ycoords[0][1] = lp_build_select(ivec_bld, sel_f23, tmp, y1);
   next_ycoords[1][1] = next_ycoords[0][1];

   tmpsel = lp_build_select(ivec_bld, sel_fand1, maxmx0, x0);
   tmp = lp_build_select(ivec_bld, sel_fand1, max_coord, ivec_bld->zero);
   next_ycoords[3][0] = lp_build_select(ivec_bld, sel_f2345, tmp, tmpsel);
   tmp = lp_build_sub(ivec_bld, max_coord, next_ycoords[3][0]);
   next_ycoords[2][0] = lp_build_select(ivec_bld, sel_f23, next_ycoords[3][0], tmp);
   tmpsel = lp_build_select(ivec_bld, sel_fand1, maxmx1, x1);
   tmp = lp_build_select(ivec_bld, sel_fand1, max_coord, ivec_bld->zero);
   next_ycoords[3][1] = lp_build_select(ivec_bld, sel_f2345, tmp, tmpsel);
   tmp = lp_build_sub(ivec_bld, max_coord, next_ycoords[3][1]);
   next_ycoords[2][1] = lp_build_select(ivec_bld, sel_f23, next_ycoords[3][1], tmp);
}
   1583 
   1584 
   1585 /** Helper used by lp_build_cube_lookup() */
   1586 static LLVMValueRef
   1587 lp_build_cube_imapos(struct lp_build_context *coord_bld, LLVMValueRef coord)
   1588 {
   1589    /* ima = +0.5 / abs(coord); */
   1590    LLVMValueRef posHalf = lp_build_const_vec(coord_bld->gallivm, coord_bld->type, 0.5);
   1591    LLVMValueRef absCoord = lp_build_abs(coord_bld, coord);
   1592    LLVMValueRef ima = lp_build_div(coord_bld, posHalf, absCoord);
   1593    return ima;
   1594 }
   1595 
   1596 
   1597 /** Helper for doing 3-wise selection.
   1598  * Returns sel1 ? val2 : (sel0 ? val0 : val1).
   1599  */
   1600 static LLVMValueRef
   1601 lp_build_select3(struct lp_build_context *sel_bld,
   1602                  LLVMValueRef sel0,
   1603                  LLVMValueRef sel1,
   1604                  LLVMValueRef val0,
   1605                  LLVMValueRef val1,
   1606                  LLVMValueRef val2)
   1607 {
   1608    LLVMValueRef tmp;
   1609    tmp = lp_build_select(sel_bld, sel0, val0, val1);
   1610    return lp_build_select(sel_bld, sel1, val2, tmp);
   1611 }
   1612 
   1613 
   1614 /**
   1615  * Generate code to do cube face selection and compute per-face texcoords.
   1616  */
   1617 void
   1618 lp_build_cube_lookup(struct lp_build_sample_context *bld,
   1619                      LLVMValueRef *coords,
   1620                      const struct lp_derivatives *derivs_in, /* optional */
   1621                      LLVMValueRef *rho,
   1622                      struct lp_derivatives *derivs_out, /* optional */
   1623                      boolean need_derivs)
   1624 {
   1625    struct lp_build_context *coord_bld = &bld->coord_bld;
   1626    LLVMBuilderRef builder = bld->gallivm->builder;
   1627    struct gallivm_state *gallivm = bld->gallivm;
   1628    LLVMValueRef si, ti, ri;
   1629 
   1630    /*
   1631     * Do per-pixel face selection. We cannot however (as we used to do)
   1632     * simply calculate the derivs afterwards (which is very bogus for
   1633     * explicit derivs btw) because the values would be "random" when
   1634     * not all pixels lie on the same face. So what we do here is just
   1635     * calculate the derivatives after scaling the coords by the absolute
   1636     * value of the inverse major axis, and essentially do rho calculation
   1637     * steps as if it were a 3d texture. This is perfect if all pixels hit
   1638     * the same face, but not so great at edges, I believe the max error
   1639     * should be sqrt(2) with no_rho_approx or 2 otherwise (essentially measuring
   1640     * the 3d distance between 2 points on the cube instead of measuring up/down
   1641     * the edge). Still this is possibly a win over just selecting the same face
   1642     * for all pixels. Unfortunately, something like that doesn't work for
   1643     * explicit derivatives.
   1644     */
   1645    struct lp_build_context *cint_bld = &bld->int_coord_bld;
   1646    struct lp_type intctype = cint_bld->type;
   1647    LLVMTypeRef coord_vec_type = coord_bld->vec_type;
   1648    LLVMTypeRef cint_vec_type = cint_bld->vec_type;
   1649    LLVMValueRef as, at, ar, face, face_s, face_t;
   1650    LLVMValueRef as_ge_at, maxasat, ar_ge_as_at;
   1651    LLVMValueRef snewx, tnewx, snewy, tnewy, snewz, tnewz;
   1652    LLVMValueRef tnegi, rnegi;
   1653    LLVMValueRef ma, mai, signma, signmabit, imahalfpos;
   1654    LLVMValueRef posHalf = lp_build_const_vec(gallivm, coord_bld->type, 0.5);
   1655    LLVMValueRef signmask = lp_build_const_int_vec(gallivm, intctype,
   1656                                                   1LL << (intctype.width - 1));
   1657    LLVMValueRef signshift = lp_build_const_int_vec(gallivm, intctype,
   1658                                                    intctype.width -1);
   1659    LLVMValueRef facex = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_X);
   1660    LLVMValueRef facey = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Y);
   1661    LLVMValueRef facez = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Z);
   1662    LLVMValueRef s = coords[0];
   1663    LLVMValueRef t = coords[1];
   1664    LLVMValueRef r = coords[2];
   1665 
   1666    assert(PIPE_TEX_FACE_NEG_X == PIPE_TEX_FACE_POS_X + 1);
   1667    assert(PIPE_TEX_FACE_NEG_Y == PIPE_TEX_FACE_POS_Y + 1);
   1668    assert(PIPE_TEX_FACE_NEG_Z == PIPE_TEX_FACE_POS_Z + 1);
   1669 
   1670    /*
   1671     * get absolute value (for x/y/z face selection) and sign bit
   1672     * (for mirroring minor coords and pos/neg face selection)
   1673     * of the original coords.
   1674     */
   1675    as = lp_build_abs(&bld->coord_bld, s);
   1676    at = lp_build_abs(&bld->coord_bld, t);
   1677    ar = lp_build_abs(&bld->coord_bld, r);
   1678 
   1679    /*
   1680     * major face determination: select x if x > y else select y
   1681     * select z if z >= max(x,y) else select previous result
   1682     * if some axis are the same we chose z over y, y over x - the
   1683     * dx10 spec seems to ask for it while OpenGL doesn't care (if we
   1684     * wouldn't care could save a select or two if using different
   1685     * compares and doing at_g_as_ar last since tnewx and tnewz are the
   1686     * same).
   1687     */
   1688    as_ge_at = lp_build_cmp(coord_bld, PIPE_FUNC_GREATER, as, at);
   1689    maxasat = lp_build_max(coord_bld, as, at);
   1690    ar_ge_as_at = lp_build_cmp(coord_bld, PIPE_FUNC_GEQUAL, ar, maxasat);
   1691 
   1692    if (need_derivs && (derivs_in || (bld->no_quad_lod && bld->no_rho_approx))) {
   1693       /*
   1694        * XXX: This is really really complex.
   1695        * It is a bit overkill to use this for implicit derivatives as well,
   1696        * no way this is worth the cost in practice, but seems to be the
   1697        * only way for getting accurate and per-pixel lod values.
   1698        */
   1699       LLVMValueRef ima, imahalf, tmp, ddx[3], ddy[3];
   1700       LLVMValueRef madx, mady, madxdivma, madydivma;
   1701       LLVMValueRef sdxi, tdxi, rdxi, sdyi, tdyi, rdyi;
   1702       LLVMValueRef tdxnegi, rdxnegi, tdynegi, rdynegi;
   1703       LLVMValueRef sdxnewx, sdxnewy, sdxnewz, tdxnewx, tdxnewy, tdxnewz;
   1704       LLVMValueRef sdynewx, sdynewy, sdynewz, tdynewx, tdynewy, tdynewz;
   1705       LLVMValueRef face_sdx, face_tdx, face_sdy, face_tdy;
   1706       /*
   1707        * s = 1/2 * ( sc / ma + 1)
   1708        * t = 1/2 * ( tc / ma + 1)
   1709        *
   1710        * s' = 1/2 * (sc' * ma - sc * ma') / ma^2
   1711        * t' = 1/2 * (tc' * ma - tc * ma') / ma^2
   1712        *
   1713        * dx.s = 0.5 * (dx.sc - sc * dx.ma / ma) / ma
   1714        * dx.t = 0.5 * (dx.tc - tc * dx.ma / ma) / ma
   1715        * dy.s = 0.5 * (dy.sc - sc * dy.ma / ma) / ma
   1716        * dy.t = 0.5 * (dy.tc - tc * dy.ma / ma) / ma
   1717        */
   1718 
   1719       /* select ma, calculate ima */
   1720       ma = lp_build_select3(coord_bld, as_ge_at, ar_ge_as_at, s, t, r);
   1721       mai = LLVMBuildBitCast(builder, ma, cint_vec_type, "");
   1722       signmabit = LLVMBuildAnd(builder, mai, signmask, "");
   1723       ima = lp_build_div(coord_bld, coord_bld->one, ma);
   1724       imahalf = lp_build_mul(coord_bld, posHalf, ima);
   1725       imahalfpos = lp_build_abs(coord_bld, imahalf);
   1726 
   1727       if (!derivs_in) {
   1728          ddx[0] = lp_build_ddx(coord_bld, s);
   1729          ddx[1] = lp_build_ddx(coord_bld, t);
   1730          ddx[2] = lp_build_ddx(coord_bld, r);
   1731          ddy[0] = lp_build_ddy(coord_bld, s);
   1732          ddy[1] = lp_build_ddy(coord_bld, t);
   1733          ddy[2] = lp_build_ddy(coord_bld, r);
   1734       }
   1735       else {
   1736          ddx[0] = derivs_in->ddx[0];
   1737          ddx[1] = derivs_in->ddx[1];
   1738          ddx[2] = derivs_in->ddx[2];
   1739          ddy[0] = derivs_in->ddy[0];
   1740          ddy[1] = derivs_in->ddy[1];
   1741          ddy[2] = derivs_in->ddy[2];
   1742       }
   1743 
   1744       /* select major derivatives */
   1745       madx = lp_build_select3(coord_bld, as_ge_at, ar_ge_as_at, ddx[0], ddx[1], ddx[2]);
   1746       mady = lp_build_select3(coord_bld, as_ge_at, ar_ge_as_at, ddy[0], ddy[1], ddy[2]);
   1747 
   1748       si = LLVMBuildBitCast(builder, s, cint_vec_type, "");
   1749       ti = LLVMBuildBitCast(builder, t, cint_vec_type, "");
   1750       ri = LLVMBuildBitCast(builder, r, cint_vec_type, "");
   1751 
   1752       sdxi = LLVMBuildBitCast(builder, ddx[0], cint_vec_type, "");
   1753       tdxi = LLVMBuildBitCast(builder, ddx[1], cint_vec_type, "");
   1754       rdxi = LLVMBuildBitCast(builder, ddx[2], cint_vec_type, "");
   1755 
   1756       sdyi = LLVMBuildBitCast(builder, ddy[0], cint_vec_type, "");
   1757       tdyi = LLVMBuildBitCast(builder, ddy[1], cint_vec_type, "");
   1758       rdyi = LLVMBuildBitCast(builder, ddy[2], cint_vec_type, "");
   1759 
   1760       /*
   1761        * compute all possible new s/t coords, which does the mirroring,
   1762        * and do the same for derivs minor axes.
   1763        * snewx = signma * -r;
   1764        * tnewx = -t;
   1765        * snewy = s;
   1766        * tnewy = signma * r;
   1767        * snewz = signma * s;
   1768        * tnewz = -t;
   1769        */
   1770       tnegi = LLVMBuildXor(builder, ti, signmask, "");
   1771       rnegi = LLVMBuildXor(builder, ri, signmask, "");
   1772       tdxnegi = LLVMBuildXor(builder, tdxi, signmask, "");
   1773       rdxnegi = LLVMBuildXor(builder, rdxi, signmask, "");
   1774       tdynegi = LLVMBuildXor(builder, tdyi, signmask, "");
   1775       rdynegi = LLVMBuildXor(builder, rdyi, signmask, "");
   1776 
   1777       snewx = LLVMBuildXor(builder, signmabit, rnegi, "");
   1778       tnewx = tnegi;
   1779       sdxnewx = LLVMBuildXor(builder, signmabit, rdxnegi, "");
   1780       tdxnewx = tdxnegi;
   1781       sdynewx = LLVMBuildXor(builder, signmabit, rdynegi, "");
   1782       tdynewx = tdynegi;
   1783 
   1784       snewy = si;
   1785       tnewy = LLVMBuildXor(builder, signmabit, ri, "");
   1786       sdxnewy = sdxi;
   1787       tdxnewy = LLVMBuildXor(builder, signmabit, rdxi, "");
   1788       sdynewy = sdyi;
   1789       tdynewy = LLVMBuildXor(builder, signmabit, rdyi, "");
   1790 
   1791       snewz = LLVMBuildXor(builder, signmabit, si, "");
   1792       tnewz = tnegi;
   1793       sdxnewz = LLVMBuildXor(builder, signmabit, sdxi, "");
   1794       tdxnewz = tdxnegi;
   1795       sdynewz = LLVMBuildXor(builder, signmabit, sdyi, "");
   1796       tdynewz = tdynegi;
   1797 
   1798       /* select the mirrored values */
   1799       face = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, facex, facey, facez);
   1800       face_s = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, snewx, snewy, snewz);
   1801       face_t = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, tnewx, tnewy, tnewz);
   1802       face_sdx = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, sdxnewx, sdxnewy, sdxnewz);
   1803       face_tdx = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, tdxnewx, tdxnewy, tdxnewz);
   1804       face_sdy = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, sdynewx, sdynewy, sdynewz);
   1805       face_tdy = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, tdynewx, tdynewy, tdynewz);
   1806 
   1807       face_s = LLVMBuildBitCast(builder, face_s, coord_vec_type, "");
   1808       face_t = LLVMBuildBitCast(builder, face_t, coord_vec_type, "");
   1809       face_sdx = LLVMBuildBitCast(builder, face_sdx, coord_vec_type, "");
   1810       face_tdx = LLVMBuildBitCast(builder, face_tdx, coord_vec_type, "");
   1811       face_sdy = LLVMBuildBitCast(builder, face_sdy, coord_vec_type, "");
   1812       face_tdy = LLVMBuildBitCast(builder, face_tdy, coord_vec_type, "");
   1813 
   1814       /* deriv math, dx.s = 0.5 * (dx.sc - sc * dx.ma / ma) / ma */
   1815       madxdivma = lp_build_mul(coord_bld, madx, ima);
   1816       tmp = lp_build_mul(coord_bld, madxdivma, face_s);
   1817       tmp = lp_build_sub(coord_bld, face_sdx, tmp);
   1818       derivs_out->ddx[0] = lp_build_mul(coord_bld, tmp, imahalf);
   1819 
   1820       /* dx.t = 0.5 * (dx.tc - tc * dx.ma / ma) / ma */
   1821       tmp = lp_build_mul(coord_bld, madxdivma, face_t);
   1822       tmp = lp_build_sub(coord_bld, face_tdx, tmp);
   1823       derivs_out->ddx[1] = lp_build_mul(coord_bld, tmp, imahalf);
   1824 
   1825       /* dy.s = 0.5 * (dy.sc - sc * dy.ma / ma) / ma */
   1826       madydivma = lp_build_mul(coord_bld, mady, ima);
   1827       tmp = lp_build_mul(coord_bld, madydivma, face_s);
   1828       tmp = lp_build_sub(coord_bld, face_sdy, tmp);
   1829       derivs_out->ddy[0] = lp_build_mul(coord_bld, tmp, imahalf);
   1830 
   1831       /* dy.t = 0.5 * (dy.tc - tc * dy.ma / ma) / ma */
   1832       tmp = lp_build_mul(coord_bld, madydivma, face_t);
   1833       tmp = lp_build_sub(coord_bld, face_tdy, tmp);
   1834       derivs_out->ddy[1] = lp_build_mul(coord_bld, tmp, imahalf);
   1835 
   1836       signma = LLVMBuildLShr(builder, mai, signshift, "");
   1837       coords[2] = LLVMBuildOr(builder, face, signma, "face");
   1838 
   1839       /* project coords */
   1840       face_s = lp_build_mul(coord_bld, face_s, imahalfpos);
   1841       face_t = lp_build_mul(coord_bld, face_t, imahalfpos);
   1842 
   1843       coords[0] = lp_build_add(coord_bld, face_s, posHalf);
   1844       coords[1] = lp_build_add(coord_bld, face_t, posHalf);
   1845 
   1846       return;
   1847    }
   1848 
   1849    else if (need_derivs) {
   1850       LLVMValueRef ddx_ddy[2], tmp[3], rho_vec;
   1851       static const unsigned char swizzle0[] = { /* no-op swizzle */
   1852          0, LP_BLD_SWIZZLE_DONTCARE,
   1853          LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
   1854       };
   1855       static const unsigned char swizzle1[] = {
   1856          1, LP_BLD_SWIZZLE_DONTCARE,
   1857          LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
   1858       };
   1859       static const unsigned char swizzle01[] = { /* no-op swizzle */
   1860          0, 1,
   1861          LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
   1862       };
   1863       static const unsigned char swizzle23[] = {
   1864          2, 3,
   1865          LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
   1866       };
   1867       static const unsigned char swizzle02[] = {
   1868          0, 2,
   1869          LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
   1870       };
   1871 
   1872       /*
   1873        * scale the s/t/r coords pre-select/mirror so we can calculate
   1874        * "reasonable" derivs.
   1875        */
   1876       ma = lp_build_select3(coord_bld, as_ge_at, ar_ge_as_at, s, t, r);
   1877       imahalfpos = lp_build_cube_imapos(coord_bld, ma);
   1878       s = lp_build_mul(coord_bld, s, imahalfpos);
   1879       t = lp_build_mul(coord_bld, t, imahalfpos);
   1880       r = lp_build_mul(coord_bld, r, imahalfpos);
   1881 
   1882       /*
   1883        * This isn't quite the same as the "ordinary" (3d deriv) path since we
   1884        * know the texture is square which simplifies things (we can omit the
   1885        * size mul which happens very early completely here and do it at the
   1886        * very end).
   1887        * Also always do calculations according to GALLIVM_DEBUG_NO_RHO_APPROX
   1888        * since the error can get quite big otherwise at edges.
   1889        * (With no_rho_approx max error is sqrt(2) at edges, same as it is
   1890        * without no_rho_approx for 2d textures, otherwise it would be factor 2.)
   1891        */
   1892       ddx_ddy[0] = lp_build_packed_ddx_ddy_twocoord(coord_bld, s, t);
   1893       ddx_ddy[1] = lp_build_packed_ddx_ddy_onecoord(coord_bld, r);
   1894 
   1895       ddx_ddy[0] = lp_build_mul(coord_bld, ddx_ddy[0], ddx_ddy[0]);
   1896       ddx_ddy[1] = lp_build_mul(coord_bld, ddx_ddy[1], ddx_ddy[1]);
   1897 
   1898       tmp[0] = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle01);
   1899       tmp[1] = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle23);
   1900       tmp[2] = lp_build_swizzle_aos(coord_bld, ddx_ddy[1], swizzle02);
   1901 
   1902       rho_vec = lp_build_add(coord_bld, tmp[0], tmp[1]);
   1903       rho_vec = lp_build_add(coord_bld, rho_vec, tmp[2]);
   1904 
   1905       tmp[0] = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle0);
   1906       tmp[1] = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1);
   1907       *rho = lp_build_max(coord_bld, tmp[0], tmp[1]);
   1908    }
   1909 
   1910    if (!need_derivs) {
   1911       ma = lp_build_select3(coord_bld, as_ge_at, ar_ge_as_at, s, t, r);
   1912    }
   1913    mai = LLVMBuildBitCast(builder, ma, cint_vec_type, "");
   1914    signmabit = LLVMBuildAnd(builder, mai, signmask, "");
   1915 
   1916    si = LLVMBuildBitCast(builder, s, cint_vec_type, "");
   1917    ti = LLVMBuildBitCast(builder, t, cint_vec_type, "");
   1918    ri = LLVMBuildBitCast(builder, r, cint_vec_type, "");
   1919 
   1920    /*
   1921     * compute all possible new s/t coords, which does the mirroring
   1922     * snewx = signma * -r;
   1923     * tnewx = -t;
   1924     * snewy = s;
   1925     * tnewy = signma * r;
   1926     * snewz = signma * s;
   1927     * tnewz = -t;
   1928     */
   1929    tnegi = LLVMBuildXor(builder, ti, signmask, "");
   1930    rnegi = LLVMBuildXor(builder, ri, signmask, "");
   1931 
   1932    snewx = LLVMBuildXor(builder, signmabit, rnegi, "");
   1933    tnewx = tnegi;
   1934 
   1935    snewy = si;
   1936    tnewy = LLVMBuildXor(builder, signmabit, ri, "");
   1937 
   1938    snewz = LLVMBuildXor(builder, signmabit, si, "");
   1939    tnewz = tnegi;
   1940 
   1941    /* select the mirrored values */
   1942    face_s = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, snewx, snewy, snewz);
   1943    face_t = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, tnewx, tnewy, tnewz);
   1944    face = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, facex, facey, facez);
   1945 
   1946    face_s = LLVMBuildBitCast(builder, face_s, coord_vec_type, "");
   1947    face_t = LLVMBuildBitCast(builder, face_t, coord_vec_type, "");
   1948 
   1949    /* add +1 for neg face */
   1950    /* XXX with AVX probably want to use another select here -
   1951     * as long as we ensure vblendvps gets used we can actually
   1952     * skip the comparison and just use sign as a "mask" directly.
   1953     */
   1954    signma = LLVMBuildLShr(builder, mai, signshift, "");
   1955    coords[2] = LLVMBuildOr(builder, face, signma, "face");
   1956 
   1957    /* project coords */
   1958    if (!need_derivs) {
   1959       imahalfpos = lp_build_cube_imapos(coord_bld, ma);
   1960       face_s = lp_build_mul(coord_bld, face_s, imahalfpos);
   1961       face_t = lp_build_mul(coord_bld, face_t, imahalfpos);
   1962    }
   1963 
   1964    coords[0] = lp_build_add(coord_bld, face_s, posHalf);
   1965    coords[1] = lp_build_add(coord_bld, face_t, posHalf);
   1966 }
   1967 
   1968 
   1969 /**
   1970  * Compute the partial offset of a pixel block along an arbitrary axis.
   1971  *
   1972  * @param coord   coordinate in pixels
   1973  * @param stride  number of bytes between rows of successive pixel blocks
   1974  * @param block_length  number of pixels in a pixels block along the coordinate
   1975  *                      axis
   1976  * @param out_offset    resulting relative offset of the pixel block in bytes
   1977  * @param out_subcoord  resulting sub-block pixel coordinate
   1978  */
   1979 void
   1980 lp_build_sample_partial_offset(struct lp_build_context *bld,
   1981                                unsigned block_length,
   1982                                LLVMValueRef coord,
   1983                                LLVMValueRef stride,
   1984                                LLVMValueRef *out_offset,
   1985                                LLVMValueRef *out_subcoord)
   1986 {
   1987    LLVMBuilderRef builder = bld->gallivm->builder;
   1988    LLVMValueRef offset;
   1989    LLVMValueRef subcoord;
   1990 
   1991    if (block_length == 1) {
   1992       subcoord = bld->zero;
   1993    }
   1994    else {
   1995       /*
   1996        * Pixel blocks have power of two dimensions. LLVM should convert the
   1997        * rem/div to bit arithmetic.
   1998        * TODO: Verify this.
   1999        * It does indeed BUT it does transform it to scalar (and back) when doing so
   2000        * (using roughly extract, shift/and, mov, unpack) (llvm 2.7).
   2001        * The generated code looks seriously unfunny and is quite expensive.
   2002        */
   2003 #if 0
   2004       LLVMValueRef block_width = lp_build_const_int_vec(bld->type, block_length);
   2005       subcoord = LLVMBuildURem(builder, coord, block_width, "");
   2006       coord    = LLVMBuildUDiv(builder, coord, block_width, "");
   2007 #else
   2008       unsigned logbase2 = util_logbase2(block_length);
   2009       LLVMValueRef block_shift = lp_build_const_int_vec(bld->gallivm, bld->type, logbase2);
   2010       LLVMValueRef block_mask = lp_build_const_int_vec(bld->gallivm, bld->type, block_length - 1);
   2011       subcoord = LLVMBuildAnd(builder, coord, block_mask, "");
   2012       coord = LLVMBuildLShr(builder, coord, block_shift, "");
   2013 #endif
   2014    }
   2015 
   2016    offset = lp_build_mul(bld, coord, stride);
   2017 
   2018    assert(out_offset);
   2019    assert(out_subcoord);
   2020 
   2021    *out_offset = offset;
   2022    *out_subcoord = subcoord;
   2023 }
   2024 
   2025 
   2026 /**
   2027  * Compute the offset of a pixel block.
   2028  *
   2029  * x, y, z, y_stride, z_stride are vectors, and they refer to pixels.
   2030  *
   2031  * Returns the relative offset and i,j sub-block coordinates
   2032  */
   2033 void
   2034 lp_build_sample_offset(struct lp_build_context *bld,
   2035                        const struct util_format_description *format_desc,
   2036                        LLVMValueRef x,
   2037                        LLVMValueRef y,
   2038                        LLVMValueRef z,
   2039                        LLVMValueRef y_stride,
   2040                        LLVMValueRef z_stride,
   2041                        LLVMValueRef *out_offset,
   2042                        LLVMValueRef *out_i,
   2043                        LLVMValueRef *out_j)
   2044 {
   2045    LLVMValueRef x_stride;
   2046    LLVMValueRef offset;
   2047 
   2048    x_stride = lp_build_const_vec(bld->gallivm, bld->type,
   2049                                  format_desc->block.bits/8);
   2050 
   2051    lp_build_sample_partial_offset(bld,
   2052                                   format_desc->block.width,
   2053                                   x, x_stride,
   2054                                   &offset, out_i);
   2055 
   2056    if (y && y_stride) {
   2057       LLVMValueRef y_offset;
   2058       lp_build_sample_partial_offset(bld,
   2059                                      format_desc->block.height,
   2060                                      y, y_stride,
   2061                                      &y_offset, out_j);
   2062       offset = lp_build_add(bld, offset, y_offset);
   2063    }
   2064    else {
   2065       *out_j = bld->zero;
   2066    }
   2067 
   2068    if (z && z_stride) {
   2069       LLVMValueRef z_offset;
   2070       LLVMValueRef k;
   2071       lp_build_sample_partial_offset(bld,
   2072                                      1, /* pixel blocks are always 2D */
   2073                                      z, z_stride,
   2074                                      &z_offset, &k);
   2075       offset = lp_build_add(bld, offset, z_offset);
   2076    }
   2077 
   2078    *out_offset = offset;
   2079 }
   2080