/**************************************************************************
 *
 * Copyright 2009 VMware, Inc.
 * Copyright 2007 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

/**
 * @file
 * Code generate the whole fragment pipeline.
 *
 * The fragment pipeline consists of the following stages:
 * - early depth test
 * - fragment shader
 * - alpha test
 * - depth/stencil test
 * - blending
 *
 * This file has only the glue to assemble the fragment pipeline.  The actual
 * plumbing of converting Gallium state into LLVM IR is done elsewhere, in the
 * lp_bld_*.[ch] files, and in a completely generic and reusable way.  Here we
 * use the LLVM JIT execution engine to create a function that follows an
 * established binary interface and that can be called from C directly.
 *
 * A big source of complexity here is that we often want to run different
 * stages with different data types and precisions.  For example, the
 * fragment shader typically needs to be done in floats, but the
 * depth/stencil test and blending are better done in the types that most
 * closely match the depth/stencil and color buffers, respectively.
 *
 * Since the width of a SIMD vector register stays the same regardless of the
 * element type, different types imply a different number of elements, so we
 * must code generate more instances of the stages with larger types to be
 * able to feed/consume the stages with smaller types.
 *
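 * For example, with 128-bit SIMD the fragment shader typically operates on
 * vectors of 4 x float32, while an 8-bit unorm color buffer is blended as
 * vectors of 16 x uint8, so several shader output vectors must be converted
 * and packed before a single blending step can consume them.
 *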
 * @author Jose Fonseca <jfonseca@vmware.com>
 */

#include <limits.h>
#include "pipe/p_defines.h"
#include "util/u_inlines.h"
#include "util/u_memory.h"
#include "util/u_pointer.h"
#include "util/u_format.h"
#include "util/u_dump.h"
#include "util/u_string.h"
#include "util/simple_list.h"
#include "util/u_dual_blend.h"
#include "util/os_time.h"
#include "pipe/p_shader_tokens.h"
#include "draw/draw_context.h"
#include "tgsi/tgsi_dump.h"
#include "tgsi/tgsi_scan.h"
#include "tgsi/tgsi_parse.h"
#include "gallivm/lp_bld_type.h"
#include "gallivm/lp_bld_const.h"
#include "gallivm/lp_bld_conv.h"
#include "gallivm/lp_bld_init.h"
#include "gallivm/lp_bld_intr.h"
#include "gallivm/lp_bld_logic.h"
#include "gallivm/lp_bld_tgsi.h"
#include "gallivm/lp_bld_swizzle.h"
#include "gallivm/lp_bld_flow.h"
#include "gallivm/lp_bld_debug.h"
#include "gallivm/lp_bld_arit.h"
#include "gallivm/lp_bld_bitarit.h"
#include "gallivm/lp_bld_pack.h"
#include "gallivm/lp_bld_format.h"
#include "gallivm/lp_bld_quad.h"

#include "lp_bld_alpha.h"
#include "lp_bld_blend.h"
#include "lp_bld_depth.h"
#include "lp_bld_interp.h"
#include "lp_context.h"
#include "lp_debug.h"
#include "lp_perf.h"
#include "lp_setup.h"
#include "lp_state.h"
#include "lp_tex_sample.h"
#include "lp_flush.h"
#include "lp_state_fs.h"
#include "lp_rast.h"


/** Fragment shader number (for debugging) */
static unsigned fs_no = 0;


/**
 * Expand the relevant bits of mask_input to an n*4-dword mask for the
 * n*4 pixels in n 2x2 quads.  This will set the n*4 elements of the
 * quad mask vector to 0 or ~0.
 * Quads are grouped as 01, 23 in two-quad mode, hence only 0 and 2 are
 * valid first_quad arguments when the fs length is 8.
 *
 * \param first_quad  which quad(s) of the quad group to test, in [0,3]
 * \param mask_input  bitwise mask for the whole 4x4 stamp
 */
static LLVMValueRef
generate_quad_mask(struct gallivm_state *gallivm,
                   struct lp_type fs_type,
                   unsigned first_quad,
                   LLVMValueRef mask_input) /* int32 */
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type mask_type;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
   LLVMValueRef bits[16];
   LLVMValueRef mask, bits_vec;
   int shift, i;

   /*
    * XXX: We'll need a different path for 16 x u8
    */
   assert(fs_type.width == 32);
   assert(fs_type.length <= ARRAY_SIZE(bits));
   mask_type = lp_int_type(fs_type);

   /*
    * mask_input >>= (quad * 4)
    */
   switch (first_quad) {
   case 0:
      shift = 0;
      break;
   case 1:
      assert(fs_type.length == 4);
      shift = 2;
      break;
   case 2:
      shift = 8;
      break;
   case 3:
      assert(fs_type.length == 4);
      shift = 10;
      break;
   default:
      assert(0);
      shift = 0;
   }

   mask_input = LLVMBuildLShr(builder,
                              mask_input,
                              LLVMConstInt(i32t, shift, 0),
                              "");

   /*
    * mask = { mask_input & (1 << i), for i in [0,3] }
    */
   mask = lp_build_broadcast(gallivm,
                             lp_build_vec_type(gallivm, mask_type),
                             mask_input);

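   /*
    * The 4x4 stamp mask is laid out row-major (bit = x + y*4), so the 2x2
    * quad whose upper-left pixel is at (x, y) covers bits j, j+1, j+4 and
    * j+5 with j = x + y*4.  Build that bit pattern for each quad i below.
    */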
   for (i = 0; i < fs_type.length / 4; i++) {
      unsigned j = 2 * (i % 2) + (i / 2) * 8;
      bits[4*i + 0] = LLVMConstInt(i32t, 1ULL << (j + 0), 0);
      bits[4*i + 1] = LLVMConstInt(i32t, 1ULL << (j + 1), 0);
      bits[4*i + 2] = LLVMConstInt(i32t, 1ULL << (j + 4), 0);
      bits[4*i + 3] = LLVMConstInt(i32t, 1ULL << (j + 5), 0);
   }
   bits_vec = LLVMConstVector(bits, fs_type.length);
   mask = LLVMBuildAnd(builder, mask, bits_vec, "");

   /*
    * mask = mask == bits ? ~0 : 0
    */
   mask = lp_build_compare(gallivm,
                           mask_type, PIPE_FUNC_EQUAL,
                           mask, bits_vec);

   return mask;
}


#define EARLY_DEPTH_TEST  0x1
#define LATE_DEPTH_TEST   0x2
#define EARLY_DEPTH_WRITE 0x4
#define LATE_DEPTH_WRITE  0x8
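
/*
 * Depth test and depth write may each be done either early (before the
 * fragment shader runs) or late (after the final coverage mask is known).
 * EARLY_DEPTH_TEST | LATE_DEPTH_WRITE is the deferred-write combination
 * used when alpha test, alpha-to-coverage, kill or samplemask writes can
 * still shrink the mask after an early test.
 */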

static int
find_output_by_semantic(const struct tgsi_shader_info *info,
                        unsigned semantic,
                        unsigned index)
{
   int i;

   for (i = 0; i < info->num_outputs; i++)
      if (info->output_semantic_name[i] == semantic &&
          info->output_semantic_index[i] == index)
         return i;

   return -1;
}


/**
 * Fetch the specified lp_jit_viewport structure for a given viewport_index.
 */
static LLVMValueRef
lp_llvm_viewport(LLVMValueRef context_ptr,
                 struct gallivm_state *gallivm,
                 LLVMValueRef viewport_index)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef ptr;
   LLVMValueRef res;
   struct lp_type viewport_type =
      lp_type_float_vec(32, 32 * LP_JIT_VIEWPORT_NUM_FIELDS);

   ptr = lp_jit_context_viewports(gallivm, context_ptr);
   ptr = LLVMBuildPointerCast(builder, ptr,
            LLVMPointerType(lp_build_vec_type(gallivm, viewport_type), 0), "");

   res = lp_build_pointer_get(builder, ptr, viewport_index);

   return res;
}


static LLVMValueRef
lp_build_depth_clamp(struct gallivm_state *gallivm,
                     LLVMBuilderRef builder,
                     struct lp_type type,
                     LLVMValueRef context_ptr,
                     LLVMValueRef thread_data_ptr,
                     LLVMValueRef z)
{
   LLVMValueRef viewport, min_depth, max_depth;
   LLVMValueRef viewport_index;
   struct lp_build_context f32_bld;

   assert(type.floating);
   lp_build_context_init(&f32_bld, gallivm, type);

   /*
    * Assumes clamping of the viewport index will occur in setup/gs. Value
    * is passed through the rasterization stage via lp_rast_shader_inputs.
    *
    * See: draw_clamp_viewport_idx and lp_clamp_viewport_idx for clamping
    *      semantics.
    */
   viewport_index = lp_jit_thread_data_raster_state_viewport_index(gallivm,
                       thread_data_ptr);

   /*
    * Load the min and max depth from the lp_jit_context.viewports
    * array of lp_jit_viewport structures.
    */
   viewport = lp_llvm_viewport(context_ptr, gallivm, viewport_index);

   /* viewports[viewport_index].min_depth */
   min_depth = LLVMBuildExtractElement(builder, viewport,
                  lp_build_const_int32(gallivm, LP_JIT_VIEWPORT_MIN_DEPTH), "");
   min_depth = lp_build_broadcast_scalar(&f32_bld, min_depth);

   /* viewports[viewport_index].max_depth */
   max_depth = LLVMBuildExtractElement(builder, viewport,
                  lp_build_const_int32(gallivm, LP_JIT_VIEWPORT_MAX_DEPTH), "");
   max_depth = lp_build_broadcast_scalar(&f32_bld, max_depth);

   /*
    * Clamp to the min and max depth values for the given viewport.
    */
   return lp_build_clamp(&f32_bld, z, min_depth, max_depth);
}


/**
 * Generate the fragment shader, depth/stencil test, and alpha tests.
 */
static void
generate_fs_loop(struct gallivm_state *gallivm,
                 struct lp_fragment_shader *shader,
                 const struct lp_fragment_shader_variant_key *key,
                 LLVMBuilderRef builder,
                 struct lp_type type,
                 LLVMValueRef context_ptr,
                 LLVMValueRef num_loop,
                 struct lp_build_interp_soa_context *interp,
                 struct lp_build_sampler_soa *sampler,
                 LLVMValueRef mask_store,
                 LLVMValueRef (*out_color)[4],
                 LLVMValueRef depth_ptr,
                 LLVMValueRef depth_stride,
                 LLVMValueRef facing,
                 LLVMValueRef thread_data_ptr)
{
   const struct util_format_description *zs_format_desc = NULL;
   const struct tgsi_token *tokens = shader->base.tokens;
   struct lp_type int_type = lp_int_type(type);
   LLVMTypeRef vec_type, int_vec_type;
   LLVMValueRef mask_ptr, mask_val;
   LLVMValueRef consts_ptr, num_consts_ptr;
   LLVMValueRef z;
   LLVMValueRef z_value, s_value;
   LLVMValueRef z_fb, s_fb;
   LLVMValueRef stencil_refs[2];
   LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][TGSI_NUM_CHANNELS];
   struct lp_build_for_loop_state loop_state;
   struct lp_build_mask_context mask;
   /*
    * TODO: figure out if the simple_shader optimization is really worthwhile
    * to keep.  Disabled because it may hide some real bugs in the
    * (depth/stencil) code since tests tend to take another codepath than
    * real shaders.
    */
   boolean simple_shader = (shader->info.base.file_count[TGSI_FILE_SAMPLER] == 0 &&
                            shader->info.base.num_inputs < 3 &&
                            shader->info.base.num_instructions < 8) && 0;
   const boolean dual_source_blend = key->blend.rt[0].blend_enable &&
                                     util_blend_state_is_dual(&key->blend, 0);
   unsigned attrib;
   unsigned chan;
   unsigned cbuf;
   unsigned depth_mode;

   struct lp_bld_tgsi_system_values system_values;

   memset(&system_values, 0, sizeof(system_values));

   if (key->depth.enabled ||
       key->stencil[0].enabled) {

      zs_format_desc = util_format_description(key->zsbuf_format);
      assert(zs_format_desc);

      if (!shader->info.base.writes_z && !shader->info.base.writes_stencil) {
         if (key->alpha.enabled ||
             key->blend.alpha_to_coverage ||
             shader->info.base.uses_kill ||
             shader->info.base.writes_samplemask) {
            /* With alpha test and kill, can do the depth test early
             * and hopefully eliminate some quads.  But need to do a
             * special deferred depth write once the final mask value
             * is known.  This only works though if there's either no
             * stencil test or the stencil value isn't written.
             */
            if (key->stencil[0].enabled && (key->stencil[0].writemask ||
                                            (key->stencil[1].enabled &&
                                             key->stencil[1].writemask)))
               depth_mode = LATE_DEPTH_TEST | LATE_DEPTH_WRITE;
            else
               depth_mode = EARLY_DEPTH_TEST | LATE_DEPTH_WRITE;
         }
         else
            depth_mode = EARLY_DEPTH_TEST | EARLY_DEPTH_WRITE;
      }
      else {
         depth_mode = LATE_DEPTH_TEST | LATE_DEPTH_WRITE;
      }

      if (!(key->depth.enabled && key->depth.writemask) &&
          !(key->stencil[0].enabled && (key->stencil[0].writemask ||
                                        (key->stencil[1].enabled &&
                                         key->stencil[1].writemask))))
         depth_mode &= ~(LATE_DEPTH_WRITE | EARLY_DEPTH_WRITE);
   }
   else {
      depth_mode = 0;
   }

   vec_type = lp_build_vec_type(gallivm, type);
   int_vec_type = lp_build_vec_type(gallivm, int_type);

   stencil_refs[0] = lp_jit_context_stencil_ref_front_value(gallivm, context_ptr);
   stencil_refs[1] = lp_jit_context_stencil_ref_back_value(gallivm, context_ptr);
   /* convert scalar stencil refs into vectors */
   stencil_refs[0] = lp_build_broadcast(gallivm, int_vec_type, stencil_refs[0]);
   stencil_refs[1] = lp_build_broadcast(gallivm, int_vec_type, stencil_refs[1]);

   consts_ptr = lp_jit_context_constants(gallivm, context_ptr);
   num_consts_ptr = lp_jit_context_num_constants(gallivm, context_ptr);

   lp_build_for_loop_begin(&loop_state, gallivm,
                           lp_build_const_int32(gallivm, 0),
                           LLVMIntULT,
                           num_loop,
                           lp_build_const_int32(gallivm, 1));

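   /*
    * One execution mask vector is stored in mask_store per iteration of
    * this loop over fragment vectors; fetch the slot for this iteration.
    */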
   mask_ptr = LLVMBuildGEP(builder, mask_store,
                           &loop_state.counter, 1, "mask_ptr");
   mask_val = LLVMBuildLoad(builder, mask_ptr, "");

   memset(outputs, 0, sizeof outputs);

   for(cbuf = 0; cbuf < key->nr_cbufs; cbuf++) {
      for(chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
         out_color[cbuf][chan] = lp_build_array_alloca(gallivm,
                                                       lp_build_vec_type(gallivm,
                                                                         type),
                                                       num_loop, "color");
      }
   }
   if (dual_source_blend) {
      assert(key->nr_cbufs <= 1);
      for(chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
         out_color[1][chan] = lp_build_array_alloca(gallivm,
                                                    lp_build_vec_type(gallivm,
                                                                      type),
                                                    num_loop, "color1");
      }
   }


   /* 'mask' will control execution based on the quad's pixel alive/killed state */
   lp_build_mask_begin(&mask, gallivm, type, mask_val);

   if (!(depth_mode & EARLY_DEPTH_TEST) && !simple_shader)
      lp_build_mask_check(&mask);

   lp_build_interp_soa_update_pos_dyn(interp, gallivm, loop_state.counter);
   z = interp->pos[2];

   if (depth_mode & EARLY_DEPTH_TEST) {
      /*
       * Clamp according to ARB_depth_clamp semantics.
       */
      if (key->depth_clamp) {
         z = lp_build_depth_clamp(gallivm, builder, type, context_ptr,
                                  thread_data_ptr, z);
      }
      lp_build_depth_stencil_load_swizzled(gallivm, type,
                                           zs_format_desc, key->resource_1d,
                                           depth_ptr, depth_stride,
                                           &z_fb, &s_fb, loop_state.counter);
      lp_build_depth_stencil_test(gallivm,
                                  &key->depth,
                                  key->stencil,
                                  type,
                                  zs_format_desc,
                                  &mask,
                                  stencil_refs,
                                  z, z_fb, s_fb,
                                  facing,
                                  &z_value, &s_value,
                                  !simple_shader);

      if (depth_mode & EARLY_DEPTH_WRITE) {
         lp_build_depth_stencil_write_swizzled(gallivm, type,
                                               zs_format_desc, key->resource_1d,
                                               NULL, NULL, NULL, loop_state.counter,
                                               depth_ptr, depth_stride,
                                               z_value, s_value);
      }
      /*
       * Note that if stencil is enabled, the mask check must come after the
       * depth/stencil write, not right after the stencil test; otherwise new
       * stencil values may not get written if all fragments got killed by
       * the depth/stencil test.
       */
      if (!simple_shader && key->stencil[0].enabled)
         lp_build_mask_check(&mask);
   }

   lp_build_interp_soa_update_inputs_dyn(interp, gallivm, loop_state.counter);

   /* Build the actual shader */
   lp_build_tgsi_soa(gallivm, tokens, type, &mask,
                     consts_ptr, num_consts_ptr, &system_values,
                     interp->inputs,
                     outputs, context_ptr, thread_data_ptr,
                     sampler, &shader->info.base, NULL);

   /* Alpha test */
   if (key->alpha.enabled) {
      int color0 = find_output_by_semantic(&shader->info.base,
                                           TGSI_SEMANTIC_COLOR,
                                           0);

      if (color0 != -1 && outputs[color0][3]) {
         const struct util_format_description *cbuf_format_desc;
         LLVMValueRef alpha = LLVMBuildLoad(builder, outputs[color0][3], "alpha");
         LLVMValueRef alpha_ref_value;

         alpha_ref_value = lp_jit_context_alpha_ref_value(gallivm, context_ptr);
         alpha_ref_value = lp_build_broadcast(gallivm, vec_type, alpha_ref_value);

         cbuf_format_desc = util_format_description(key->cbuf_format[0]);

         lp_build_alpha_test(gallivm, key->alpha.func, type, cbuf_format_desc,
                             &mask, alpha, alpha_ref_value,
                             (depth_mode & LATE_DEPTH_TEST) != 0);
      }
   }

   /* Emulate Alpha to Coverage with Alpha test */
   if (key->blend.alpha_to_coverage) {
      int color0 = find_output_by_semantic(&shader->info.base,
                                           TGSI_SEMANTIC_COLOR,
                                           0);

      if (color0 != -1 && outputs[color0][3]) {
         LLVMValueRef alpha = LLVMBuildLoad(builder, outputs[color0][3], "alpha");

         lp_build_alpha_to_coverage(gallivm, type,
                                    &mask, alpha,
                                    (depth_mode & LATE_DEPTH_TEST) != 0);
      }
   }

   if (shader->info.base.writes_samplemask) {
      int smaski = find_output_by_semantic(&shader->info.base,
                                           TGSI_SEMANTIC_SAMPLEMASK,
                                           0);
      LLVMValueRef smask;
      struct lp_build_context smask_bld;
      lp_build_context_init(&smask_bld, gallivm, int_type);

      assert(smaski >= 0);
      smask = LLVMBuildLoad(builder, outputs[smaski][0], "smask");
      /*
       * Pixel is alive according to the first sample in the mask.
       */
      smask = LLVMBuildBitCast(builder, smask, smask_bld.vec_type, "");
      smask = lp_build_and(&smask_bld, smask, smask_bld.one);
      smask = lp_build_cmp(&smask_bld, PIPE_FUNC_NOTEQUAL, smask, smask_bld.zero);
      lp_build_mask_update(&mask, smask);
   }

   /* Late Z test */
   if (depth_mode & LATE_DEPTH_TEST) {
      int pos0 = find_output_by_semantic(&shader->info.base,
                                         TGSI_SEMANTIC_POSITION,
                                         0);
      int s_out = find_output_by_semantic(&shader->info.base,
                                          TGSI_SEMANTIC_STENCIL,
                                          0);
      if (pos0 != -1 && outputs[pos0][2]) {
         z = LLVMBuildLoad(builder, outputs[pos0][2], "output.z");
      }
      /*
       * Clamp according to ARB_depth_clamp semantics.
       */
      if (key->depth_clamp) {
         z = lp_build_depth_clamp(gallivm, builder, type, context_ptr,
                                  thread_data_ptr, z);
      }

      if (s_out != -1 && outputs[s_out][1]) {
         /* there's only one value, and the spec says to discard additional bits */
         LLVMValueRef s_max_mask = lp_build_const_int_vec(gallivm, int_type, 255);
         stencil_refs[0] = LLVMBuildLoad(builder, outputs[s_out][1], "output.s");
         stencil_refs[0] = LLVMBuildBitCast(builder, stencil_refs[0], int_vec_type, "");
         stencil_refs[0] = LLVMBuildAnd(builder, stencil_refs[0], s_max_mask, "");
         stencil_refs[1] = stencil_refs[0];
      }

      lp_build_depth_stencil_load_swizzled(gallivm, type,
                                           zs_format_desc, key->resource_1d,
                                           depth_ptr, depth_stride,
                                           &z_fb, &s_fb, loop_state.counter);

      lp_build_depth_stencil_test(gallivm,
                                  &key->depth,
                                  key->stencil,
                                  type,
                                  zs_format_desc,
                                  &mask,
                                  stencil_refs,
                                  z, z_fb, s_fb,
                                  facing,
                                  &z_value, &s_value,
                                  !simple_shader);
      /* Late Z write */
      if (depth_mode & LATE_DEPTH_WRITE) {
         lp_build_depth_stencil_write_swizzled(gallivm, type,
                                               zs_format_desc, key->resource_1d,
                                               NULL, NULL, NULL, loop_state.counter,
                                               depth_ptr, depth_stride,
                                               z_value, s_value);
      }
   }
   else if ((depth_mode & EARLY_DEPTH_TEST) &&
            (depth_mode & LATE_DEPTH_WRITE))
   {
      /* Need to apply a reduced mask to the depth write.  Reload the
       * depth value, update from zs_value with the new mask value and
       * write that out.
       */
      lp_build_depth_stencil_write_swizzled(gallivm, type,
                                            zs_format_desc, key->resource_1d,
                                            &mask, z_fb, s_fb, loop_state.counter,
                                            depth_ptr, depth_stride,
                                            z_value, s_value);
   }


   /* Color write  */
   for (attrib = 0; attrib < shader->info.base.num_outputs; ++attrib)
   {
      unsigned cbuf = shader->info.base.output_semantic_index[attrib];
      if ((shader->info.base.output_semantic_name[attrib] == TGSI_SEMANTIC_COLOR) &&
           ((cbuf < key->nr_cbufs) || (cbuf == 1 && dual_source_blend)))
      {
         for(chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
            if(outputs[attrib][chan]) {
               /* XXX: just initialize outputs to point at colors[] and
                * skip this.
                */
               LLVMValueRef out = LLVMBuildLoad(builder, outputs[attrib][chan], "");
               LLVMValueRef color_ptr;
               color_ptr = LLVMBuildGEP(builder, out_color[cbuf][chan],
                                        &loop_state.counter, 1, "");
               lp_build_name(out, "color%u.%c", attrib, "rgba"[chan]);
               LLVMBuildStore(builder, out, color_ptr);
            }
         }
      }
   }

   if (key->occlusion_count) {
      LLVMValueRef counter = lp_jit_thread_data_counter(gallivm, thread_data_ptr);
      lp_build_name(counter, "counter");
      lp_build_occlusion_count(gallivm, type,
                               lp_build_mask_value(&mask), counter);
   }

   mask_val = lp_build_mask_end(&mask);
   LLVMBuildStore(builder, mask_val, mask_ptr);
   lp_build_for_loop_end(&loop_state);
}


/**
 * Reorder pixels from the fragment shader's SoA layout to the AoS memory
 * layout.
 *
 * The fragment shader outputs pixels in small 2x2 blocks,
 *  e.g. (0, 0), (1, 0), (0, 1), (1, 1) ; (2, 0) ...
 *
 * However, in memory pixels are stored in rows,
 *  e.g. (0, 0), (1, 0), (2, 0), (3, 0) ; (0, 1) ...
 *
 * @param type            fragment shader type (4x or 8x float)
 * @param num_fs          number of fragment shader source vectors in fs_src
 * @param dst_channels    number of output channels
 * @param fs_src          output from the fragment shader
 * @param dst             pointer to store the result
 * @param pad_inline      whether channel padding is inline or at the end of the row
 * @return                the number of dst vectors written
 */
static int
generate_fs_twiddle(struct gallivm_state *gallivm,
                    struct lp_type type,
                    unsigned num_fs,
                    unsigned dst_channels,
                    LLVMValueRef fs_src[][4],
                    LLVMValueRef* dst,
                    bool pad_inline)
{
   LLVMValueRef src[16];

   bool swizzle_pad;
   bool twiddle;
   bool split;

   unsigned pixels = type.length / 4;
   unsigned reorder_group;
   unsigned src_channels;
   unsigned src_count;
   unsigned i;

   src_channels = dst_channels < 3 ? dst_channels : 4;
   src_count = num_fs * src_channels;

   assert(pixels == 2 || pixels == 1);
   assert(num_fs * src_channels <= ARRAY_SIZE(src));

   /*
    * Transpose from SoA -> AoS
    */
   for (i = 0; i < num_fs; ++i) {
      lp_build_transpose_aos_n(gallivm, type, &fs_src[i][0], src_channels, &src[i * src_channels]);
   }

   /*
    * Pick transformation options
    */
   swizzle_pad = false;
   twiddle = false;
   split = false;
   reorder_group = 0;

   if (dst_channels == 1) {
      twiddle = true;

      if (pixels == 2) {
         split = true;
      }
   } else if (dst_channels == 2) {
      if (pixels == 1) {
         reorder_group = 1;
      }
   } else if (dst_channels > 2) {
      if (pixels == 1) {
         reorder_group = 2;
      } else {
         twiddle = true;
      }

      if (!pad_inline && dst_channels == 3 && pixels > 1) {
         swizzle_pad = true;
      }
   }
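
   /*
    * To summarize the choices above: single-channel output is quad-twiddled
    * (splitting 8-wide vectors in half first); with 4-wide vectors, 2-channel
    * and 3/4-channel outputs are reordered in groups of 1 or 2 array entries
    * respectively; with 8-wide vectors, 3/4-channel output is quad-twiddled;
    * and 8-wide 3-channel output without inline padding additionally gets its
    * pad elements swizzled to the end.
    */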

   /*
    * Split the src in half
    */
   if (split) {
      for (i = num_fs; i > 0; --i) {
         src[(i - 1)*2 + 1] = lp_build_extract_range(gallivm, src[i - 1], 4, 4);
         src[(i - 1)*2 + 0] = lp_build_extract_range(gallivm, src[i - 1], 0, 4);
      }

      src_count *= 2;
      type.length = 4;
   }

   /*
    * Ensure pixels are in memory order
    */
   if (reorder_group) {
      /* Twiddle pixels by reordering the array, e.g.:
       *
       * src_count =  8 -> 0 2 1 3 4 6 5 7
       * src_count = 16 -> 0 1 4 5 2 3 6 7 8 9 12 13 10 11 14 15
       */
      const unsigned reorder_sw[] = { 0, 2, 1, 3 };

      for (i = 0; i < src_count; ++i) {
         unsigned group = i / reorder_group;
         unsigned block = (group / 4) * 4 * reorder_group;
         unsigned j = block + (reorder_sw[group % 4] * reorder_group) + (i % reorder_group);
         dst[i] = src[j];
      }
   } else if (twiddle) {
      /* Twiddle pixels across elements of array */
      /*
       * XXX: we should avoid this in some cases, but would need to tell
       * lp_build_conv to reorder (or deal with it ourselves).
       */
      lp_bld_quad_twiddle(gallivm, type, src, src_count, dst);
   } else {
      /* Do nothing */
      memcpy(dst, src, sizeof(LLVMValueRef) * src_count);
   }

   /*
    * Moves any padding between pixels to the end,
    * e.g. RGBXRGBX -> RGBRGBXX
    */
   if (swizzle_pad) {
      unsigned char swizzles[16];
      unsigned elems = pixels * dst_channels;

      for (i = 0; i < type.length; ++i) {
         if (i < elems)
            swizzles[i] = i % dst_channels + (i / dst_channels) * 4;
         else
            swizzles[i] = LP_BLD_SWIZZLE_DONTCARE;
      }

      for (i = 0; i < src_count; ++i) {
         dst[i] = lp_build_swizzle_aos_n(gallivm, dst[i], swizzles, type.length, type.length);
      }
   }

   return src_count;
}


/*
 * Untwiddle and transpose, much like the above.
 * However, this is after conversion, so we get packed vectors.
 * At this time we only handle 4x16i8 rgba / 2x16i8 rg / 1x16i8 r data,
 * so the vectors will look like:
 * r0r1r4r5r2r3r6r7r8r9r12... (albeit color channels may
 * be swizzled here).  Extending to 16-bit data should be trivial.
 * Should also be extended to handle twice-wide vectors with AVX2...
 */
static void
fs_twiddle_transpose(struct gallivm_state *gallivm,
                     struct lp_type type,
                     LLVMValueRef *src,
                     unsigned src_count,
                     LLVMValueRef *dst)
{
   unsigned i, j;
   struct lp_type type64, type16, type32;
   LLVMTypeRef type64_t, type8_t, type16_t, type32_t;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef tmp[4], shuf[8];
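   /*
    * shuf selects lanes 0, 2, 1, 3 within each group of four, i.e. it swaps
    * the two middle elements; applied to wider-than-byte lanes below, this
    * undoes the quad twiddle within a vector.
    */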
   for (j = 0; j < 2; j++) {
      shuf[j*4 + 0] = lp_build_const_int32(gallivm, j*4 + 0);
      shuf[j*4 + 1] = lp_build_const_int32(gallivm, j*4 + 2);
      shuf[j*4 + 2] = lp_build_const_int32(gallivm, j*4 + 1);
      shuf[j*4 + 3] = lp_build_const_int32(gallivm, j*4 + 3);
   }

   assert(src_count == 4 || src_count == 2 || src_count == 1);
   assert(type.width == 8);
   assert(type.length == 16);

   type8_t = lp_build_vec_type(gallivm, type);

   type64 = type;
   type64.length /= 8;
   type64.width *= 8;
   type64_t = lp_build_vec_type(gallivm, type64);

   type16 = type;
   type16.length /= 2;
   type16.width *= 2;
   type16_t = lp_build_vec_type(gallivm, type16);

   type32 = type;
   type32.length /= 4;
   type32.width *= 4;
   type32_t = lp_build_vec_type(gallivm, type32);

   lp_build_transpose_aos_n(gallivm, type, src, src_count, tmp);

   if (src_count == 1) {
      /* transpose was a no-op, just untwiddle */
      LLVMValueRef shuf_vec;
      shuf_vec = LLVMConstVector(shuf, 8);
      tmp[0] = LLVMBuildBitCast(builder, src[0], type16_t, "");
      tmp[0] = LLVMBuildShuffleVector(builder, tmp[0], tmp[0], shuf_vec, "");
      dst[0] = LLVMBuildBitCast(builder, tmp[0], type8_t, "");
   } else if (src_count == 2) {
      LLVMValueRef shuf_vec;
      shuf_vec = LLVMConstVector(shuf, 4);

      for (i = 0; i < 2; i++) {
         tmp[i] = LLVMBuildBitCast(builder, tmp[i], type32_t, "");
         tmp[i] = LLVMBuildShuffleVector(builder, tmp[i], tmp[i], shuf_vec, "");
         dst[i] = LLVMBuildBitCast(builder, tmp[i], type8_t, "");
      }
   } else {
      for (j = 0; j < 2; j++) {
         LLVMValueRef lo, hi, lo2, hi2;
         /*
          * Note that if we only really have 3 valid channels (rgb)
          * and we don't need alpha we could substitute an undef here
          * for the respective channel (causing llvm to drop the
          * conversion for alpha).
          */
         /* we now have rgba0rgba1rgba4rgba5 etc, untwiddle */
         lo2 = LLVMBuildBitCast(builder, tmp[j*2], type64_t, "");
         hi2 = LLVMBuildBitCast(builder, tmp[j*2 + 1], type64_t, "");
         lo = lp_build_interleave2(gallivm, type64, lo2, hi2, 0);
         hi = lp_build_interleave2(gallivm, type64, lo2, hi2, 1);
         dst[j*2] = LLVMBuildBitCast(builder, lo, type8_t, "");
         dst[j*2 + 1] = LLVMBuildBitCast(builder, hi, type8_t, "");
      }
   }
}


/**
 * Load an unswizzled block of pixels from memory
 */
static void
load_unswizzled_block(struct gallivm_state *gallivm,
                      LLVMValueRef base_ptr,
                      LLVMValueRef stride,
                      unsigned block_width,
                      unsigned block_height,
                      LLVMValueRef* dst,
                      struct lp_type dst_type,
                      unsigned dst_count,
                      unsigned dst_alignment)
{
   LLVMBuilderRef builder = gallivm->builder;
   unsigned row_size = dst_count / block_height;
   unsigned i;

   /* Ensure block exactly fits into dst */
   assert((block_width * block_height) % dst_count == 0);

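   /*
    * E.g. a 4x4 block of 32-bit pixels loaded as four 4-wide vectors:
    * row_size is 1, so vector i covers row y = i at byte offset y * stride,
    * plus x vector-sized steps into the row.
    */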
   for (i = 0; i < dst_count; ++i) {
      unsigned x = i % row_size;
      unsigned y = i / row_size;

      LLVMValueRef bx = lp_build_const_int32(gallivm, x * (dst_type.width / 8) * dst_type.length);
      LLVMValueRef by = LLVMBuildMul(builder, lp_build_const_int32(gallivm, y), stride, "");

      LLVMValueRef gep[2];
      LLVMValueRef dst_ptr;

      gep[0] = lp_build_const_int32(gallivm, 0);
      gep[1] = LLVMBuildAdd(builder, bx, by, "");

      dst_ptr = LLVMBuildGEP(builder, base_ptr, gep, 2, "");
      dst_ptr = LLVMBuildBitCast(builder, dst_ptr,
                                 LLVMPointerType(lp_build_vec_type(gallivm, dst_type), 0), "");

      dst[i] = LLVMBuildLoad(builder, dst_ptr, "");

      LLVMSetAlignment(dst[i], dst_alignment);
   }
}


/**
 * Store an unswizzled block of pixels to memory
 */
static void
store_unswizzled_block(struct gallivm_state *gallivm,
                       LLVMValueRef base_ptr,
                       LLVMValueRef stride,
                       unsigned block_width,
                       unsigned block_height,
                       LLVMValueRef* src,
                       struct lp_type src_type,
                       unsigned src_count,
                       unsigned src_alignment)
{
   LLVMBuilderRef builder = gallivm->builder;
   unsigned row_size = src_count / block_height;
   unsigned i;

   /* Ensure src exactly fits into block */
   assert((block_width * block_height) % src_count == 0);

   for (i = 0; i < src_count; ++i) {
      unsigned x = i % row_size;
      unsigned y = i / row_size;

      LLVMValueRef bx = lp_build_const_int32(gallivm, x * (src_type.width / 8) * src_type.length);
      LLVMValueRef by = LLVMBuildMul(builder, lp_build_const_int32(gallivm, y), stride, "");

      LLVMValueRef gep[2];
      LLVMValueRef src_ptr;

      gep[0] = lp_build_const_int32(gallivm, 0);
      gep[1] = LLVMBuildAdd(builder, bx, by, "");

      src_ptr = LLVMBuildGEP(builder, base_ptr, gep, 2, "");
      src_ptr = LLVMBuildBitCast(builder, src_ptr,
                                 LLVMPointerType(lp_build_vec_type(gallivm, src_type), 0), "");

      src_ptr = LLVMBuildStore(builder, src[i], src_ptr);

      LLVMSetAlignment(src_ptr, src_alignment);
   }
}


/**
 * Checks if a format description is an arithmetic format,
 * i.e. a format which has irregular channel sizes such as R3_G3_B2 or R5_G6_B5.
 */
static inline boolean
is_arithmetic_format(const struct util_format_description *format_desc)
{
   boolean arith = false;
   unsigned i;

   for (i = 0; i < format_desc->nr_channels; ++i) {
      arith |= format_desc->channel[i].size != format_desc->channel[0].size;
      arith |= (format_desc->channel[i].size % 8) != 0;
   }

   return arith;
}


/**
 * Checks if this format requires special handling due to required expansion
 * to floats for blending, and furthermore has "natural" packed AoS ->
 * unpacked SoA conversion.
 */
static inline boolean
format_expands_to_float_soa(const struct util_format_description *format_desc)
{
   if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT ||
       format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
      return true;
   }
   return false;
}


/**
 * Retrieves the type representing the memory layout for a format
 *
 * e.g. RGBA16F = 4x half-float and R3G3B2 = 1x byte
 */
static inline void
lp_mem_type_from_format_desc(const struct util_format_description *format_desc,
                             struct lp_type* type)
{
   unsigned i;
   unsigned chan;

   if (format_expands_to_float_soa(format_desc)) {
      /* just make this a uint with width of block */
      type->floating = false;
      type->fixed = false;
      type->sign = false;
      type->norm = false;
      type->width = format_desc->block.bits;
      type->length = 1;
      return;
   }

   for (i = 0; i < 4; i++)
      if (format_desc->channel[i].type != UTIL_FORMAT_TYPE_VOID)
         break;
   chan = i;

   memset(type, 0, sizeof(struct lp_type));
   type->floating = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FLOAT;
   type->fixed    = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FIXED;
   type->sign     = format_desc->channel[chan].type != UTIL_FORMAT_TYPE_UNSIGNED;
   type->norm     = format_desc->channel[chan].normalized;

   if (is_arithmetic_format(format_desc)) {
      type->width = 0;
      type->length = 1;

      for (i = 0; i < format_desc->nr_channels; ++i) {
         type->width += format_desc->channel[i].size;
      }
   } else {
      type->width = format_desc->channel[chan].size;
      type->length = format_desc->nr_channels;
   }
}


/**
 * Retrieves the type for a format which is usable in the blending code.
 *
 * e.g. RGBA16F = 4x float, R3G3B2 = 3x byte
 */
static inline void
lp_blend_type_from_format_desc(const struct util_format_description *format_desc,
                               struct lp_type* type)
{
   unsigned i;
   unsigned chan;

   if (format_expands_to_float_soa(format_desc)) {
      /* always use ordinary floats for blending */
      type->floating = true;
      type->fixed = false;
      type->sign = true;
      type->norm = false;
      type->width = 32;
      type->length = 4;
      return;
   }

   for (i = 0; i < 4; i++)
      if (format_desc->channel[i].type != UTIL_FORMAT_TYPE_VOID)
         break;
   chan = i;

   memset(type, 0, sizeof(struct lp_type));
   type->floating = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FLOAT;
   type->fixed    = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FIXED;
   type->sign     = format_desc->channel[chan].type != UTIL_FORMAT_TYPE_UNSIGNED;
   type->norm     = format_desc->channel[chan].normalized;
   type->width    = format_desc->channel[chan].size;
   type->length   = format_desc->nr_channels;

   for (i = 1; i < format_desc->nr_channels; ++i) {
      if (format_desc->channel[i].size > type->width)
         type->width = format_desc->channel[i].size;
   }

   if (type->floating) {
      type->width = 32;
   } else {
      if (type->width <= 8) {
         type->width = 8;
      } else if (type->width <= 16) {
         type->width = 16;
      } else {
         type->width = 32;
      }
   }

   if (is_arithmetic_format(format_desc) && type->length == 3) {
      type->length = 4;
   }
}


/**
 * Scale a normalized value from src_bits to dst_bits.
 *
 * The exact calculation is
 *
 *    dst = iround(src * dst_mask / src_mask)
 *
 *  or with integer rounding
 *
 *    dst = src * (2*dst_mask + sign(src)*src_mask) / (2*src_mask)
 *
 *  where
 *
 *    src_mask = (1 << src_bits) - 1
 *    dst_mask = (1 << dst_bits) - 1
 *
 * but we try to avoid division and multiplication through shifts.
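 *
 * For example, expanding a 4-bit channel to 8 bits should compute
 * dst = src * 255 / 15; the scale-up path below computes
 * (src << 4) | (src >> 0), i.e. src * 17, which is exactly that.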
 */
static inline LLVMValueRef
scale_bits(struct gallivm_state *gallivm,
           int src_bits,
           int dst_bits,
           LLVMValueRef src,
           struct lp_type src_type)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef result = src;

   if (dst_bits < src_bits) {
      int delta_bits = src_bits - dst_bits;

      if (delta_bits <= dst_bits) {
         /*
          * Approximate the rescaling with a single shift.
          *
          * This gives the wrong rounding.
          */

         result = LLVMBuildLShr(builder,
                                src,
                                lp_build_const_int_vec(gallivm, src_type, delta_bits),
                                "");

      } else {
         /*
          * Try more accurate rescaling.
          */

         /*
          * Drop the least significant bits to make space for the multiplication.
          *
          * XXX: A better approach would be to use a wider integer type as
          * intermediate.  But this is enough to convert alpha from 16bits ->
          * 2 when rendering to PIPE_FORMAT_R10G10B10A2_UNORM.
          */
         result = LLVMBuildLShr(builder,
                                src,
                                lp_build_const_int_vec(gallivm, src_type, dst_bits),
                                "");


         result = LLVMBuildMul(builder,
                               result,
                               lp_build_const_int_vec(gallivm, src_type, (1LL << dst_bits) - 1),
                               "");

         /*
          * Add a rounding term before the division.
          *
          * TODO: Handle signed integers too.
          */
         if (!src_type.sign) {
            result = LLVMBuildAdd(builder,
                                  result,
                                  lp_build_const_int_vec(gallivm, src_type, (1LL << (delta_bits - 1))),
                                  "");
         }

         /*
          * Approximate the division by src_mask with a src_bits shift.
          *
          * Given the src has already been shifted by dst_bits, all we need
          * to do is to shift by the difference.
          */

         result = LLVMBuildLShr(builder,
                                result,
                                lp_build_const_int_vec(gallivm, src_type, delta_bits),
                                "");
      }

   } else if (dst_bits > src_bits) {
      /* Scale up bits */
      int db = dst_bits - src_bits;

      /* Shift left by difference in bits */
      result = LLVMBuildShl(builder,
                            src,
                            lp_build_const_int_vec(gallivm, src_type, db),
                            "");

      if (db <= src_bits) {
         /* Enough bits in src to fill the remainder */
         LLVMValueRef lower = LLVMBuildLShr(builder,
                                            src,
                                            lp_build_const_int_vec(gallivm, src_type, src_bits - db),
                                            "");

         result = LLVMBuildOr(builder, result, lower, "");
      } else if (db > src_bits) {
         /* Need to repeatedly copy src bits to fill remainder in dst */
         unsigned n;

         for (n = src_bits; n < dst_bits; n *= 2) {
            LLVMValueRef shuv = lp_build_const_int_vec(gallivm, src_type, n);

            result = LLVMBuildOr(builder,
                                 result,
                                 LLVMBuildLShr(builder, result, shuv, ""),
                                 "");
         }
      }
   }

   return result;
}

/**
 * If RT is a smallfloat (needing denorms) format
 */
static inline int
have_smallfloat_format(struct lp_type dst_type,
                       enum pipe_format format)
{
   return ((dst_type.floating && dst_type.width != 32) ||
    /* due to format handling hacks this format doesn't have floating set
     * here (and actually has width set to 32 too) so special case this. */
    (format == PIPE_FORMAT_R11G11B10_FLOAT));
}

/**
 * Convert from memory format to blending format
 *
 * e.g. GL_R3G3B2 is 1 byte in memory but 3 bytes for blending
 */
static void
convert_to_blend_type(struct gallivm_state *gallivm,
                      unsigned block_size,
                      const struct util_format_description *src_fmt,
                      struct lp_type src_type,
                      struct lp_type dst_type,
                      LLVMValueRef* src, // and dst
                      unsigned num_srcs)
{
   LLVMValueRef *dst = src;
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type blend_type;
   struct lp_type mem_type;
   unsigned i, j;
   unsigned pixels = block_size / num_srcs;
   bool is_arith;

   /*
    * Full custom path for packed floats and srgb formats - none of the
    * helper functions used below would do anything useful, and given the
    * lp_type representation they can't be fixed.  Should really have some
    * SoA blend path for these kinds of formats rather than hacking them in
    * here.
    */
   if (format_expands_to_float_soa(src_fmt)) {
      LLVMValueRef tmpsrc[4];
      /*
       * This is pretty suboptimal for this case; blending in SoA would be
       * much better, since conversion gets us SoA values, so we need to
       * convert back.
       */
      assert(src_type.width == 32 || src_type.width == 16);
      assert(dst_type.floating);
      assert(dst_type.width == 32);
      assert(dst_type.length % 4 == 0);
      assert(num_srcs % 4 == 0);

      if (src_type.width == 16) {
         /* expand 4x16bit values to 4x32bit */
         struct lp_type type32x4 = src_type;
         LLVMTypeRef ltype32x4;
         unsigned num_fetch = dst_type.length == 8 ? num_srcs / 2 : num_srcs / 4;
         type32x4.width = 32;
         ltype32x4 = lp_build_vec_type(gallivm, type32x4);
         for (i = 0; i < num_fetch; i++) {
            src[i] = LLVMBuildZExt(builder, src[i], ltype32x4, "");
         }
         src_type.width = 32;
      }
      for (i = 0; i < 4; i++) {
         tmpsrc[i] = src[i];
      }
      for (i = 0; i < num_srcs / 4; i++) {
         LLVMValueRef tmpsoa[4];
         LLVMValueRef tmps = tmpsrc[i];
         if (dst_type.length == 8) {
            LLVMValueRef shuffles[8];
            unsigned j;
            /* fetch was 4 values but we need 8-wide output values */
            tmps = lp_build_concat(gallivm, &tmpsrc[i * 2], src_type, 2);
            /*
             * for 8-wide, an aos transpose would give us the wrong order,
             * not matching the incoming converted fs values and mask. ARGH.
             */
            for (j = 0; j < 4; j++) {
               shuffles[j] = lp_build_const_int32(gallivm, j * 2);
               shuffles[j + 4] = lp_build_const_int32(gallivm, j * 2 + 1);
            }
            tmps = LLVMBuildShuffleVector(builder, tmps, tmps,
                                          LLVMConstVector(shuffles, 8), "");
         }
         if (src_fmt->format == PIPE_FORMAT_R11G11B10_FLOAT) {
            lp_build_r11g11b10_to_float(gallivm, tmps, tmpsoa);
         }
         else {
            lp_build_unpack_rgba_soa(gallivm, src_fmt, dst_type, tmps, tmpsoa);
         }
         lp_build_transpose_aos(gallivm, dst_type, tmpsoa, &src[i * 4]);
      }
      return;
   }
   1338 
   1339    lp_mem_type_from_format_desc(src_fmt, &mem_type);
   1340    lp_blend_type_from_format_desc(src_fmt, &blend_type);
   1341 
   1342    /* Is the format arithmetic (blend and memory layouts differ in total bits)? */
   1343    is_arith = blend_type.length * blend_type.width != mem_type.width * mem_type.length;
   1344    is_arith &= !(mem_type.width == 16 && mem_type.floating);
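           /*
            * E.g. R3G3B2: 3x8 = 24 blend bits vs. 8 bits in memory -> arithmetic;
            * a byte-aligned format like R8G8B8A8 (32 vs. 32 bits) is not.
            */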
   1345 
   1346    /* Pad if necessary */
   1347    if (!is_arith && src_type.length < dst_type.length) {
   1348       for (i = 0; i < num_srcs; ++i) {
   1349          dst[i] = lp_build_pad_vector(gallivm, src[i], dst_type.length);
   1350       }
   1351 
   1352       src_type.length = dst_type.length;
   1353    }
   1354 
   1355    /* Special case for half-floats */
   1356    if (mem_type.width == 16 && mem_type.floating) {
   1357       assert(blend_type.width == 32 && blend_type.floating);
   1358       lp_build_conv_auto(gallivm, src_type, &dst_type, dst, num_srcs, dst);
   1359       is_arith = false;
   1360    }
   1361 
   1362    if (!is_arith) {
   1363       return;
   1364    }
   1365 
   1366    src_type.width = blend_type.width * blend_type.length;
   1367    blend_type.length *= pixels;
   1368    src_type.length *= pixels / (src_type.length / mem_type.length);
   1369 
   1370    for (i = 0; i < num_srcs; ++i) {
   1371       LLVMValueRef chans[4];
   1372       LLVMValueRef res = NULL;
   1373 
   1374       dst[i] = LLVMBuildZExt(builder, src[i], lp_build_vec_type(gallivm, src_type), "");
   1375 
   1376       for (j = 0; j < src_fmt->nr_channels; ++j) {
   1377          unsigned mask = 0;
   1378          unsigned sa = src_fmt->channel[j].shift;
   1379 #ifdef PIPE_ARCH_LITTLE_ENDIAN
   1380          unsigned from_lsb = j;
   1381 #else
   1382          unsigned from_lsb = src_fmt->nr_channels - j - 1;
   1383 #endif
   1384 
   1385          mask = (1 << src_fmt->channel[j].size) - 1;
   1386 
   1387          /* Extract bits from source */
   1388          chans[j] = LLVMBuildLShr(builder,
   1389                                   dst[i],
   1390                                   lp_build_const_int_vec(gallivm, src_type, sa),
   1391                                   "");
   1392 
   1393          chans[j] = LLVMBuildAnd(builder,
   1394                                  chans[j],
   1395                                  lp_build_const_int_vec(gallivm, src_type, mask),
   1396                                  "");
   1397 
   1398          /* Scale bits */
   1399          if (src_type.norm) {
   1400             chans[j] = scale_bits(gallivm, src_fmt->channel[j].size,
   1401                                   blend_type.width, chans[j], src_type);
   1402          }
   1403 
   1404          /* Insert bits into correct position */
   1405          chans[j] = LLVMBuildShl(builder,
   1406                                  chans[j],
   1407                                  lp_build_const_int_vec(gallivm, src_type, from_lsb * blend_type.width),
   1408                                  "");
   1409 
   1410          if (j == 0) {
   1411             res = chans[j];
   1412          } else {
   1413             res = LLVMBuildOr(builder, res, chans[j], "");
   1414          }
   1415       }
   1416 
   1417       dst[i] = LLVMBuildBitCast(builder, res, lp_build_vec_type(gallivm, blend_type), "");
   1418    }
   1419 }
   1420 
   1421 
   1422 /**
   1423  * Convert from blending format to memory format
   1424  *
   1425  * e.g. GL_R3G3B2 is 3 bytes for blending but 1 byte in memory
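         *
         * (The inverse of convert_to_blend_type: each 8-bit blend channel is
         * scaled back down to its 3- or 2-bit memory size and re-packed.)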
   1426  */
   1427 static void
   1428 convert_from_blend_type(struct gallivm_state *gallivm,
   1429                         unsigned block_size,
   1430                         const struct util_format_description *src_fmt,
   1431                         struct lp_type src_type,
   1432                         struct lp_type dst_type,
   1433                         LLVMValueRef* src, /* and dst */
   1434                         unsigned num_srcs)
   1435 {
   1436    LLVMValueRef* dst = src;
   1437    unsigned i, j, k;
   1438    struct lp_type mem_type;
   1439    struct lp_type blend_type;
   1440    LLVMBuilderRef builder = gallivm->builder;
   1441    unsigned pixels = block_size / num_srcs;
   1442    bool is_arith;
   1443 
   1444    /*
   1445     * full custom path for packed floats and srgb formats - none of the later
   1446     * functions would do anything useful, and given the lp_type representation
   1447     * they can't be fixed. Should really have some SoA blend path for these
   1448     * kinds of formats rather than hacking them in here.
   1449     */
   1450    if (format_expands_to_float_soa(src_fmt)) {
   1451       /*
   1452        * This is pretty suboptimal for this case; blending in SoA would be much
   1453        * better - we need to transpose the AoS values back to SoA values for
   1454        * conversion/packing.
   1455        */
   1456       assert(src_type.floating);
   1457       assert(src_type.width == 32);
   1458       assert(src_type.length % 4 == 0);
   1459       assert(dst_type.width == 32 || dst_type.width == 16);
   1460 
   1461       for (i = 0; i < num_srcs / 4; i++) {
   1462          LLVMValueRef tmpsoa[4], tmpdst;
   1463          lp_build_transpose_aos(gallivm, src_type, &src[i * 4], tmpsoa);
   1464          /* really really need SoA here */
   1465 
   1466          if (src_fmt->format == PIPE_FORMAT_R11G11B10_FLOAT) {
   1467             tmpdst = lp_build_float_to_r11g11b10(gallivm, tmpsoa);
   1468          }
   1469          else {
   1470             tmpdst = lp_build_float_to_srgb_packed(gallivm, src_fmt,
   1471                                                    src_type, tmpsoa);
   1472          }
   1473 
   1474          if (src_type.length == 8) {
   1475             LLVMValueRef tmpaos, shuffles[8];
   1476             unsigned j;
   1477             /*
   1478              * for 8-wide aos the transpose has given us the wrong order, not matching
   1479              * the output order. HMPF. Also need to split the output values manually.
   1480              */
   1481             for (j = 0; j < 4; j++) {
   1482                shuffles[j * 2] = lp_build_const_int32(gallivm, j);
   1483                shuffles[j * 2 + 1] = lp_build_const_int32(gallivm, j + 4);
   1484             }
   1485             tmpaos = LLVMBuildShuffleVector(builder, tmpdst, tmpdst,
   1486                                             LLVMConstVector(shuffles, 8), "");
   1487             src[i * 2] = lp_build_extract_range(gallivm, tmpaos, 0, 4);
   1488             src[i * 2 + 1] = lp_build_extract_range(gallivm, tmpaos, 4, 4);
   1489          }
   1490          else {
   1491             src[i] = tmpdst;
   1492          }
   1493       }
   1494       if (dst_type.width == 16) {
   1495          struct lp_type type16x8 = dst_type;
   1496          struct lp_type type32x4 = dst_type;
   1497          LLVMTypeRef ltype16x4, ltypei64, ltypei128;
   1498          unsigned num_fetch = src_type.length == 8 ? num_srcs / 2 : num_srcs / 4;
   1499          type16x8.length = 8;
   1500          type32x4.width = 32;
   1501          ltypei128 = LLVMIntTypeInContext(gallivm->context, 128);
   1502          ltypei64 = LLVMIntTypeInContext(gallivm->context, 64);
   1503          ltype16x4 = lp_build_vec_type(gallivm, dst_type);
   1504          /* We could do vector truncation but it doesn't generate very good code */
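                 /*
                  * Instead (a sketch of the loop below): pack 4x32 -> 8x16 with
                  * a zero high half, then bitcast to i128, truncate to i64 and
                  * bitcast back, keeping just the low 4x16 lanes.
                  */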
   1505          for (i = 0; i < num_fetch; i++) {
   1506             src[i] = lp_build_pack2(gallivm, type32x4, type16x8,
   1507                                     src[i], lp_build_zero(gallivm, type32x4));
   1508             src[i] = LLVMBuildBitCast(builder, src[i], ltypei128, "");
   1509             src[i] = LLVMBuildTrunc(builder, src[i], ltypei64, "");
   1510             src[i] = LLVMBuildBitCast(builder, src[i], ltype16x4, "");
   1511          }
   1512       }
   1513       return;
   1514    }
   1515 
   1516    lp_mem_type_from_format_desc(src_fmt, &mem_type);
   1517    lp_blend_type_from_format_desc(src_fmt, &blend_type);
   1518 
   1519    is_arith = (blend_type.length * blend_type.width != mem_type.width * mem_type.length);
   1520 
   1521    /* Special case for half-floats */
   1522    if (mem_type.width == 16 && mem_type.floating) {
   1523       int length = dst_type.length;
   1524       assert(blend_type.width == 32 && blend_type.floating);
   1525 
   1526       dst_type.length = src_type.length;
   1527 
   1528       lp_build_conv_auto(gallivm, src_type, &dst_type, dst, num_srcs, dst);
   1529 
   1530       dst_type.length = length;
   1531       is_arith = false;
   1532    }
   1533 
   1534    /* Remove any padding */
   1535    if (!is_arith && (src_type.length % mem_type.length)) {
   1536       src_type.length -= (src_type.length % mem_type.length);
   1537 
   1538       for (i = 0; i < num_srcs; ++i) {
   1539          dst[i] = lp_build_extract_range(gallivm, dst[i], 0, src_type.length);
   1540       }
   1541    }
   1542 
   1543    /* No bit arithmetic to do */
   1544    if (!is_arith) {
   1545       return;
   1546    }
   1547 
   1548    src_type.length = pixels;
   1549    src_type.width = blend_type.length * blend_type.width;
   1550    dst_type.length = pixels;
   1551 
   1552    for (i = 0; i < num_srcs; ++i) {
   1553       LLVMValueRef chans[4];
   1554       LLVMValueRef res = NULL;
   1555 
   1556       dst[i] = LLVMBuildBitCast(builder, src[i], lp_build_vec_type(gallivm, src_type), "");
   1557 
   1558       for (j = 0; j < src_fmt->nr_channels; ++j) {
   1559          unsigned mask = 0;
   1560          unsigned sa = src_fmt->channel[j].shift;
   1561 #ifdef PIPE_ARCH_LITTLE_ENDIAN
   1562          unsigned from_lsb = j;
   1563 #else
   1564          unsigned from_lsb = src_fmt->nr_channels - j - 1;
   1565 #endif
   1566 
   1567          assert(blend_type.width > src_fmt->channel[j].size);
   1568 
   1569          for (k = 0; k < blend_type.width; ++k) {
   1570             mask |= 1 << k;
   1571          }
   1572 
   1573          /* Extract bits */
   1574          chans[j] = LLVMBuildLShr(builder,
   1575                                   dst[i],
   1576                                   lp_build_const_int_vec(gallivm, src_type,
   1577                                                          from_lsb * blend_type.width),
   1578                                   "");
   1579 
   1580          chans[j] = LLVMBuildAnd(builder,
   1581                                  chans[j],
   1582                                  lp_build_const_int_vec(gallivm, src_type, mask),
   1583                                  "");
   1584 
   1585          /* Scale down bits */
   1586          if (src_type.norm) {
   1587             chans[j] = scale_bits(gallivm, blend_type.width,
   1588                                   src_fmt->channel[j].size, chans[j], src_type);
   1589          }
   1590 
   1591          /* Insert bits */
   1592          chans[j] = LLVMBuildShl(builder,
   1593                                  chans[j],
   1594                                  lp_build_const_int_vec(gallivm, src_type, sa),
   1595                                  "");
   1596 
   1597          sa += src_fmt->channel[j].size;
   1598 
   1599          if (j == 0) {
   1600             res = chans[j];
   1601          } else {
   1602             res = LLVMBuildOr(builder, res, chans[j], "");
   1603          }
   1604       }
   1605 
   1606       assert (dst_type.width != 24);
   1607 
   1608       dst[i] = LLVMBuildTrunc(builder, res, lp_build_vec_type(gallivm, dst_type), "");
   1609    }
   1610 }
   1611 
   1612 
   1613 /**
   1614  * Convert alpha to same blend type as src
   1615  */
   1616 static void
   1617 convert_alpha(struct gallivm_state *gallivm,
   1618               struct lp_type row_type,
   1619               struct lp_type alpha_type,
   1620               const unsigned block_size,
   1621               const unsigned block_height,
   1622               const unsigned src_count,
   1623               const unsigned dst_channels,
   1624               const bool pad_inline,
   1625               LLVMValueRef* src_alpha)
   1626 {
   1627    LLVMBuilderRef builder = gallivm->builder;
   1628    unsigned i, j;
   1629    unsigned length = row_type.length;
   1630    row_type.length = alpha_type.length;
   1631 
   1632    /* Twiddle the alpha to match pixels */
   1633    lp_bld_quad_twiddle(gallivm, alpha_type, src_alpha, block_height, src_alpha);
   1634 
   1635    /*
   1636     * TODO: this should use a single lp_build_conv call for the
   1637     * src_count == 1 && dst_channels == 1 case (dropping the concat below)
   1638     */
   1639    for (i = 0; i < block_height; ++i) {
   1640       lp_build_conv(gallivm, alpha_type, row_type, &src_alpha[i], 1, &src_alpha[i], 1);
   1641    }
   1642 
   1643    alpha_type = row_type;
   1644    row_type.length = length;
   1645 
   1646    /* If there is only one channel we need just the single alpha value per pixel */
   1647    if (src_count == 1 && dst_channels == 1) {
   1648 
   1649       lp_build_concat_n(gallivm, alpha_type, src_alpha, block_height, src_alpha, src_count);
   1650    } else {
   1651       /* If there are more srcs than rows then we need to split alpha up */
   1652       if (src_count > block_height) {
   1653          for (i = src_count; i > 0; --i) {
   1654             unsigned pixels = block_size / src_count;
   1655             unsigned idx = i - 1;
   1656 
   1657             src_alpha[idx] = lp_build_extract_range(gallivm, src_alpha[(idx * pixels) / 4],
   1658                                                     (idx * pixels) % 4, pixels);
   1659          }
   1660       }
   1661 
   1662       /* If there is a src for each pixel, broadcast the alpha across the whole row */
   1663       if (src_count == block_size) {
   1664          for (i = 0; i < src_count; ++i) {
   1665             src_alpha[i] = lp_build_broadcast(gallivm,
   1666                               lp_build_vec_type(gallivm, row_type), src_alpha[i]);
   1667          }
   1668       } else {
   1669          unsigned pixels = block_size / src_count;
   1670          unsigned channels = pad_inline ? TGSI_NUM_CHANNELS : dst_channels;
   1671          unsigned alpha_span = 1;
   1672          LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
   1673 
   1674          /* Check if we need 2 src_alphas for our shuffles */
   1675          if (pixels > alpha_type.length) {
   1676             alpha_span = 2;
   1677          }
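                 /*
                  * E.g. 8 pixels per row but 4-wide alpha vectors: each shuffle
                  * below then draws from two adjacent src_alpha vectors.
                  */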
   1678 
   1679          /* Broadcast alpha across all channels, e.g. a1a2 to a1a1a1a1a2a2a2a2 */
   1680          for (j = 0; j < row_type.length; ++j) {
   1681             if (j < pixels * channels) {
   1682                shuffles[j] = lp_build_const_int32(gallivm, j / channels);
   1683             } else {
   1684                shuffles[j] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
   1685             }
   1686          }
   1687 
   1688          for (i = 0; i < src_count; ++i) {
   1689             unsigned idx1 = i, idx2 = i;
   1690 
   1691             if (alpha_span > 1) {
   1692                idx1 *= alpha_span;
   1693                idx2 = idx1 + 1;
   1694             }
   1695 
   1696             src_alpha[i] = LLVMBuildShuffleVector(builder,
   1697                                                   src_alpha[idx1],
   1698                                                   src_alpha[idx2],
   1699                                                   LLVMConstVector(shuffles, row_type.length),
   1700                                                   "");
   1701          }
   1702       }
   1703    }
   1704 }
   1705 
   1706 
   1707 /**
   1708  * Generates the blend function for unswizzled colour buffers.
   1709  * Also generates the read & write from the colour buffer.
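         *
         * Roughly, the steps below: twiddle fs outputs into memory order,
         * convert them to the blend type, build the coverage masks, load the
         * dst rows, convert dst to the blend type too, blend, convert back
         * to the memory format and store the rows again.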
   1710  */
   1711 static void
   1712 generate_unswizzled_blend(struct gallivm_state *gallivm,
   1713                           unsigned rt,
   1714                           struct lp_fragment_shader_variant *variant,
   1715                           enum pipe_format out_format,
   1716                           unsigned int num_fs,
   1717                           struct lp_type fs_type,
   1718                           LLVMValueRef* fs_mask,
   1719                           LLVMValueRef fs_out_color[PIPE_MAX_COLOR_BUFS][TGSI_NUM_CHANNELS][4],
   1720                           LLVMValueRef context_ptr,
   1721                           LLVMValueRef color_ptr,
   1722                           LLVMValueRef stride,
   1723                           unsigned partial_mask,
   1724                           boolean do_branch)
   1725 {
   1726    const unsigned alpha_channel = 3;
   1727    const unsigned block_width = LP_RASTER_BLOCK_SIZE;
   1728    const unsigned block_height = LP_RASTER_BLOCK_SIZE;
   1729    const unsigned block_size = block_width * block_height;
   1730    const unsigned lp_integer_vector_width = 128;
   1731 
   1732    LLVMBuilderRef builder = gallivm->builder;
   1733    LLVMValueRef fs_src[4][TGSI_NUM_CHANNELS];
   1734    LLVMValueRef fs_src1[4][TGSI_NUM_CHANNELS];
   1735    LLVMValueRef src_alpha[4 * 4];
   1736    LLVMValueRef src1_alpha[4 * 4] = { NULL };
   1737    LLVMValueRef src_mask[4 * 4];
   1738    LLVMValueRef src[4 * 4];
   1739    LLVMValueRef src1[4 * 4];
   1740    LLVMValueRef dst[4 * 4];
   1741    LLVMValueRef blend_color;
   1742    LLVMValueRef blend_alpha;
   1743    LLVMValueRef i32_zero;
   1744    LLVMValueRef check_mask;
   1745    LLVMValueRef undef_src_val;
   1746 
   1747    struct lp_build_mask_context mask_ctx;
   1748    struct lp_type mask_type;
   1749    struct lp_type blend_type;
   1750    struct lp_type row_type;
   1751    struct lp_type dst_type;
   1752    struct lp_type ls_type;
   1753 
   1754    unsigned char swizzle[TGSI_NUM_CHANNELS];
   1755    unsigned vector_width;
   1756    unsigned src_channels = TGSI_NUM_CHANNELS;
   1757    unsigned dst_channels;
   1758    unsigned dst_count;
   1759    unsigned src_count;
   1760    unsigned i, j;
   1761 
   1762    const struct util_format_description* out_format_desc = util_format_description(out_format);
   1763 
   1764    unsigned dst_alignment;
   1765 
   1766    bool pad_inline = is_arithmetic_format(out_format_desc);
   1767    bool has_alpha = false;
   1768    const boolean dual_source_blend = variant->key.blend.rt[0].blend_enable &&
   1769                                      util_blend_state_is_dual(&variant->key.blend, 0);
   1770 
   1771    const boolean is_1d = variant->key.resource_1d;
   1772    boolean twiddle_after_convert = FALSE;
   1773    unsigned num_fullblock_fs = is_1d ? 2 * num_fs : num_fs;
   1774    LLVMValueRef fpstate = 0;
   1775 
   1776    /* Get type from output format */
   1777    lp_blend_type_from_format_desc(out_format_desc, &row_type);
   1778    lp_mem_type_from_format_desc(out_format_desc, &dst_type);
   1779 
   1780    /*
   1781     * Technically this code should go into lp_build_smallfloat_to_float
   1782     * and lp_build_float_to_smallfloat, but due to
   1783     * http://llvm.org/bugs/show_bug.cgi?id=6393
   1784     * llvm reorders the mxcsr intrinsics in a way that breaks the code.
   1785     * So the ordering is important here, and there shouldn't be any
   1786     * llvm ir instructions in this function before this point, otherwise
   1787     * half-float format conversions won't work
   1788     * (again due to llvm bug #6393).
   1789     */
   1790    if (have_smallfloat_format(dst_type, out_format)) {
   1791       /* We need to make sure that denorms are ok for half float
   1792          conversions */
   1793       fpstate = lp_build_fpstate_get(gallivm);
   1794       lp_build_fpstate_set_denorms_zero(gallivm, FALSE);
   1795    }
   1796 
   1797    mask_type = lp_int32_vec4_type();
   1798    mask_type.length = fs_type.length;
   1799 
   1800    for (i = num_fs; i < num_fullblock_fs; i++) {
   1801       fs_mask[i] = lp_build_zero(gallivm, mask_type);
   1802    }
   1803 
   1804    /* Do not bother executing code when mask is empty. */
   1805    if (do_branch) {
   1806       check_mask = LLVMConstNull(lp_build_int_vec_type(gallivm, mask_type));
   1807 
   1808       for (i = 0; i < num_fullblock_fs; ++i) {
   1809          check_mask = LLVMBuildOr(builder, check_mask, fs_mask[i], "");
   1810       }
   1811 
   1812       lp_build_mask_begin(&mask_ctx, gallivm, mask_type, check_mask);
   1813       lp_build_mask_check(&mask_ctx);
   1814    }
   1815 
   1816    partial_mask |= !variant->opaque;
   1817    i32_zero = lp_build_const_int32(gallivm, 0);
   1818 
   1819    undef_src_val = lp_build_undef(gallivm, fs_type);
   1820 
   1821    row_type.length = fs_type.length;
   1822    vector_width    = dst_type.floating ? lp_native_vector_width : lp_integer_vector_width;
   1823 
   1824    /* Compute correct swizzle and count channels */
   1825    memset(swizzle, LP_BLD_SWIZZLE_DONTCARE, TGSI_NUM_CHANNELS);
   1826    dst_channels = 0;
   1827 
   1828    for (i = 0; i < TGSI_NUM_CHANNELS; ++i) {
   1829       /* Ensure channel is used */
   1830       if (out_format_desc->swizzle[i] >= TGSI_NUM_CHANNELS) {
   1831          continue;
   1832       }
   1833 
   1834       /* Ensure it has not already been written to (happens in the GL_ALPHA case) */
   1835       if (swizzle[out_format_desc->swizzle[i]] < TGSI_NUM_CHANNELS) {
   1836          continue;
   1837       }
   1838 
   1839       /* Ensure we haven't already found all channels */
   1840       if (dst_channels >= out_format_desc->nr_channels) {
   1841          continue;
   1842       }
   1843 
   1844       swizzle[out_format_desc->swizzle[i]] = i;
   1845       ++dst_channels;
   1846 
   1847       if (i == alpha_channel) {
   1848          has_alpha = true;
   1849       }
   1850    }
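           /*
            * E.g. for PIPE_FORMAT_B8G8R8A8_UNORM the loop above should yield
            * swizzle = {2, 1, 0, 3}: memory channel 0 (B) takes fs output 2, etc.
            */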
   1851 
   1852    if (format_expands_to_float_soa(out_format_desc)) {
   1853       /*
   1854        * the code above can't work for layout_other;
   1855        * for srgb it would sort of work, but we short-circuit swizzles, etc.,
   1856        * as that is done as part of unpack / pack.
   1857        */
   1858       dst_channels = 4; /* HACK: this is really a fake 4, but we need it for the transpose later */
   1859       has_alpha = true;
   1860       swizzle[0] = 0;
   1861       swizzle[1] = 1;
   1862       swizzle[2] = 2;
   1863       swizzle[3] = 3;
   1864       pad_inline = true; /* HACK: prevent rgbxrgbx->rgbrgbxx conversion later */
   1865    }
   1866 
   1867    /* If 3 channels then pad to include alpha for 4 element transpose */
   1868    if (dst_channels == 3) {
   1869       assert (!has_alpha);
   1870       for (i = 0; i < TGSI_NUM_CHANNELS; i++) {
   1871          if (swizzle[i] > TGSI_NUM_CHANNELS)
   1872             swizzle[i] = 3;
   1873       }
   1874       if (out_format_desc->nr_channels == 4) {
   1875          dst_channels = 4;
   1876          /*
   1877           * We use alpha from the color conversion, not separate one.
   1878           * We had to include it for transpose, hence it will get converted
   1879           * too (albeit when doing transpose after conversion, that would
   1880           * no longer be the case necessarily).
   1881           * (It works only with 4 channel dsts, e.g. rgbx formats, because
   1882           * otherwise we really have padding, not alpha, included.)
   1883           */
   1884          has_alpha = true;
   1885       }
   1886    }
   1887 
   1888    /*
   1889     * Load shader output
   1890     */
   1891    for (i = 0; i < num_fullblock_fs; ++i) {
   1892       /* Always load alpha for use in blending */
   1893       LLVMValueRef alpha;
   1894       if (i < num_fs) {
   1895          alpha = LLVMBuildLoad(builder, fs_out_color[rt][alpha_channel][i], "");
   1896       }
   1897       else {
   1898          alpha = undef_src_val;
   1899       }
   1900 
   1901       /* Load each channel */
   1902       for (j = 0; j < dst_channels; ++j) {
   1903          assert(swizzle[j] < 4);
   1904          if (i < num_fs) {
   1905             fs_src[i][j] = LLVMBuildLoad(builder, fs_out_color[rt][swizzle[j]][i], "");
   1906          }
   1907          else {
   1908             fs_src[i][j] = undef_src_val;
   1909          }
   1910       }
   1911 
   1912       /* If 3 channels then pad to include alpha for 4 element transpose */
   1913       /*
   1914        * XXX If we include that here maybe we could actually use it instead
   1915        * of the separate alpha for blending?
   1916        * (Difficult though, since we actually convert pad channels, not alpha.)
   1917        */
   1918       if (dst_channels == 3 && !has_alpha) {
   1919          fs_src[i][3] = alpha;
   1920       }
   1921 
   1922       /* We split the row_mask and row_alpha as we want 128bit interleave */
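              /*
               * (E.g. with 8-wide fs vectors each mask and alpha vector is
               * split into two 4-wide halves here.)
               */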
   1923       if (fs_type.length == 8) {
   1924          src_mask[i*2 + 0]  = lp_build_extract_range(gallivm, fs_mask[i],
   1925                                                      0, src_channels);
   1926          src_mask[i*2 + 1]  = lp_build_extract_range(gallivm, fs_mask[i],
   1927                                                      src_channels, src_channels);
   1928 
   1929          src_alpha[i*2 + 0] = lp_build_extract_range(gallivm, alpha, 0, src_channels);
   1930          src_alpha[i*2 + 1] = lp_build_extract_range(gallivm, alpha,
   1931                                                      src_channels, src_channels);
   1932       } else {
   1933          src_mask[i] = fs_mask[i];
   1934          src_alpha[i] = alpha;
   1935       }
   1936    }
   1937    if (dual_source_blend) {
   1938       /* same as above except different src/dst, skip masks and comments... */
   1939       for (i = 0; i < num_fullblock_fs; ++i) {
   1940          LLVMValueRef alpha;
   1941          if (i < num_fs) {
   1942             alpha = LLVMBuildLoad(builder, fs_out_color[1][alpha_channel][i], "");
   1943          }
   1944          else {
   1945             alpha = undef_src_val;
   1946          }
   1947 
   1948          for (j = 0; j < dst_channels; ++j) {
   1949             assert(swizzle[j] < 4);
   1950             if (i < num_fs) {
   1951                fs_src1[i][j] = LLVMBuildLoad(builder, fs_out_color[1][swizzle[j]][i], "");
   1952             }
   1953             else {
   1954                fs_src1[i][j] = undef_src_val;
   1955             }
   1956          }
   1957          if (dst_channels == 3 && !has_alpha) {
   1958             fs_src1[i][3] = alpha;
   1959          }
   1960          if (fs_type.length == 8) {
   1961             src1_alpha[i*2 + 0] = lp_build_extract_range(gallivm, alpha, 0, src_channels);
   1962             src1_alpha[i*2 + 1] = lp_build_extract_range(gallivm, alpha,
   1963                                                          src_channels, src_channels);
   1964          } else {
   1965             src1_alpha[i] = alpha;
   1966          }
   1967       }
   1968    }
   1969 
   1970    if (util_format_is_pure_integer(out_format)) {
   1971       /*
   1972        * In this case fs_type was really ints or uints disguised as floats,
   1973        * fix that up now.
   1974        */
   1975       fs_type.floating = 0;
   1976       fs_type.sign = dst_type.sign;
   1977       for (i = 0; i < num_fullblock_fs; ++i) {
   1978          for (j = 0; j < dst_channels; ++j) {
   1979             fs_src[i][j] = LLVMBuildBitCast(builder, fs_src[i][j],
   1980                                             lp_build_vec_type(gallivm, fs_type), "");
   1981          }
   1982          if (dst_channels == 3 && !has_alpha) {
   1983             fs_src[i][3] = LLVMBuildBitCast(builder, fs_src[i][3],
   1984                                             lp_build_vec_type(gallivm, fs_type), "");
   1985          }
   1986       }
   1987    }
   1988 
   1989    /*
   1990     * We really should do the conversion first (for non-1d cases) when
   1991     * the blend format is 8 or 16 bits. The reason is obvious:
   1992     * there are 2 or 4 times fewer vectors to deal with for the interleave...
   1993     * Albeit for the AVX (not AVX2) case there's no benefit with 16 bit
   1994     * vectors (as it can do 32bit unpack with 256bit vectors, but 8/16bit
   1995     * unpack only with 128bit vectors).
   1996     * Note: for 16bit sizes we would really need matching pack conversion code.
   1997     */
   1998    if (!is_1d && dst_channels != 3 && dst_type.width == 8) {
   1999       twiddle_after_convert = TRUE;
   2000    }
   2001 
   2002    /*
   2003     * Pixel twiddle from fragment shader order to memory order
   2004     */
   2005    if (!twiddle_after_convert) {
   2006       src_count = generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs,
   2007                                       dst_channels, fs_src, src, pad_inline);
   2008       if (dual_source_blend) {
   2009          generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs, dst_channels,
   2010                              fs_src1, src1, pad_inline);
   2011       }
   2012    } else {
   2013       src_count = num_fullblock_fs * dst_channels;
   2014       /*
   2015        * We reorder things a bit here, so the cases for 4-wide and 8-wide
   2016        * (AVX) turn out the same later when untwiddling/transpose (albeit
   2017        * for true AVX2 path untwiddle needs to be different).
   2018        * For now just order by colors first (so we can use unpack later).
   2019        */
   2020       for (j = 0; j < num_fullblock_fs; j++) {
   2021          for (i = 0; i < dst_channels; i++) {
   2022             src[i*num_fullblock_fs + j] = fs_src[j][i];
   2023             if (dual_source_blend) {
   2024                src1[i*num_fullblock_fs + j] = fs_src1[j][i];
   2025             }
   2026          }
   2027       }
   2028    }
   2029 
   2030    src_channels = dst_channels < 3 ? dst_channels : 4;
   2031    if (src_count != num_fullblock_fs * src_channels) {
   2032       unsigned ds = src_count / (num_fullblock_fs * src_channels);
   2033       row_type.length /= ds;
   2034       fs_type.length = row_type.length;
   2035    }
   2036 
   2037    blend_type = row_type;
   2038    mask_type.length = 4;
   2039 
   2040    /* Convert src to row_type */
   2041    if (dual_source_blend) {
   2042       struct lp_type old_row_type = row_type;
   2043       lp_build_conv_auto(gallivm, fs_type, &row_type, src, src_count, src);
   2044       src_count = lp_build_conv_auto(gallivm, fs_type, &old_row_type, src1, src_count, src1);
   2045    }
   2046    else {
   2047       src_count = lp_build_conv_auto(gallivm, fs_type, &row_type, src, src_count, src);
   2048    }
   2049 
   2050    /* If the rows are not an SSE vector, combine them to become SSE size! */
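           /*
            * (E.g. four 32-bit unorm8x4 rows get concatenated into a single
            * unorm8x16 row, and src_count shrinks by the same factor.)
            */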
   2051    if ((row_type.width * row_type.length) % 128) {
   2052       unsigned bits = row_type.width * row_type.length;
   2053       unsigned combined;
   2054 
   2055       assert(src_count >= (vector_width / bits));
   2056 
   2057       dst_count = src_count / (vector_width / bits);
   2058 
   2059       combined = lp_build_concat_n(gallivm, row_type, src, src_count, src, dst_count);
   2060       if (dual_source_blend) {
   2061          lp_build_concat_n(gallivm, row_type, src1, src_count, src1, dst_count);
   2062       }
   2063 
   2064       row_type.length *= combined;
   2065       src_count /= combined;
   2066 
   2067       bits = row_type.width * row_type.length;
   2068       assert(bits == 128 || bits == 256);
   2069    }
   2070 
   2071    if (twiddle_after_convert) {
   2072       fs_twiddle_transpose(gallivm, row_type, src, src_count, src);
   2073       if (dual_source_blend) {
   2074          fs_twiddle_transpose(gallivm, row_type, src1, src_count, src1);
   2075       }
   2076    }
   2077 
   2078    /*
   2079     * Blend Colour conversion
   2080     */
   2081    blend_color = lp_jit_context_f_blend_color(gallivm, context_ptr);
   2082    blend_color = LLVMBuildPointerCast(builder, blend_color,
   2083                     LLVMPointerType(lp_build_vec_type(gallivm, fs_type), 0), "");
   2084    blend_color = LLVMBuildLoad(builder, LLVMBuildGEP(builder, blend_color,
   2085                                &i32_zero, 1, ""), "");
   2086 
   2087    /* Convert */
   2088    lp_build_conv(gallivm, fs_type, blend_type, &blend_color, 1, &blend_color, 1);
   2089 
   2090    if (out_format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
   2091       /*
   2092        * since blending is done with floats, there was no conversion.
   2093        * However, the rules for fixed point renderbuffers still
   2094        * apply, that is, we must clamp inputs to 0.0/1.0.
   2095        * (This would apply to separate alpha conversion too but we currently
   2096        * force has_alpha to be true.)
   2097        * TODO: should skip this with "fake" blend, since post-blend conversion
   2098        * will clamp anyway.
   2099        * TODO: could also skip this if fragment color clamping is enabled. We
   2100        * don't support it natively so it gets baked into the shader however, so
   2101        * can't really tell here.
   2102        */
   2103       struct lp_build_context f32_bld;
   2104       assert(row_type.floating);
   2105       lp_build_context_init(&f32_bld, gallivm, row_type);
   2106       for (i = 0; i < src_count; i++) {
   2107          src[i] = lp_build_clamp_zero_one_nanzero(&f32_bld, src[i]);
   2108       }
   2109       if (dual_source_blend) {
   2110          for (i = 0; i < src_count; i++) {
   2111             src1[i] = lp_build_clamp_zero_one_nanzero(&f32_bld, src1[i]);
   2112          }
   2113       }
   2114       /* probably can't be different from row_type, but better safe than sorry... */
   2115       lp_build_context_init(&f32_bld, gallivm, blend_type);
   2116       blend_color = lp_build_clamp(&f32_bld, blend_color, f32_bld.zero, f32_bld.one);
   2117    }
   2118 
   2119    /* Extract alpha */
   2120    blend_alpha = lp_build_extract_broadcast(gallivm, blend_type, row_type, blend_color, lp_build_const_int32(gallivm, 3));
   2121 
   2122    /* Swizzle to appropriate channels, e.g. from RGBA to BGRA BGRA */
   2123    pad_inline &= (dst_channels * (block_size / src_count) * row_type.width) != vector_width;
   2124    if (pad_inline) {
   2125       /* Use all 4 channels e.g. from RGBA RGBA to RGxx RGxx */
   2126       blend_color = lp_build_swizzle_aos_n(gallivm, blend_color, swizzle, TGSI_NUM_CHANNELS, row_type.length);
   2127    } else {
   2128       /* Only use dst_channels e.g. RGBA RGBA to RG RG xxxx */
   2129       blend_color = lp_build_swizzle_aos_n(gallivm, blend_color, swizzle, dst_channels, row_type.length);
   2130    }
   2131 
   2132    /*
   2133     * Mask conversion
   2134     */
   2135    lp_bld_quad_twiddle(gallivm, mask_type, &src_mask[0], block_height, &src_mask[0]);
   2136 
   2137    if (src_count < block_height) {
   2138       lp_build_concat_n(gallivm, mask_type, src_mask, 4, src_mask, src_count);
   2139    } else if (src_count > block_height) {
   2140       for (i = src_count; i > 0; --i) {
   2141          unsigned pixels = block_size / src_count;
   2142          unsigned idx = i - 1;
   2143 
   2144          src_mask[idx] = lp_build_extract_range(gallivm, src_mask[(idx * pixels) / 4],
   2145                                                 (idx * pixels) % 4, pixels);
   2146       }
   2147    }
   2148 
   2149    assert(mask_type.width == 32);
   2150 
   2151    for (i = 0; i < src_count; ++i) {
   2152       unsigned pixels = block_size / src_count;
   2153       unsigned pixel_width = row_type.width * dst_channels;
   2154 
   2155       if (pixel_width == 24) {
   2156          mask_type.width = 8;
   2157          mask_type.length = vector_width / mask_type.width;
   2158       } else {
   2159          mask_type.length = pixels;
   2160          mask_type.width = row_type.width * dst_channels;
   2161 
   2162          /*
   2163           * If mask_type width is smaller than 32bit, this doesn't quite
   2164           * generate the most efficient code (could use some pack).
   2165           */
   2166          src_mask[i] = LLVMBuildIntCast(builder, src_mask[i],
   2167                                         lp_build_int_vec_type(gallivm, mask_type), "");
   2168 
   2169          mask_type.length *= dst_channels;
   2170          mask_type.width /= dst_channels;
   2171       }
   2172 
   2173       src_mask[i] = LLVMBuildBitCast(builder, src_mask[i],
   2174                                      lp_build_int_vec_type(gallivm, mask_type), "");
   2175       src_mask[i] = lp_build_pad_vector(gallivm, src_mask[i], row_type.length);
   2176    }
   2177 
   2178    /*
   2179     * Alpha conversion
   2180     */
   2181    if (!has_alpha) {
   2182       struct lp_type alpha_type = fs_type;
   2183       alpha_type.length = 4;
   2184       convert_alpha(gallivm, row_type, alpha_type,
   2185                     block_size, block_height,
   2186                     src_count, dst_channels,
   2187                     pad_inline, src_alpha);
   2188       if (dual_source_blend) {
   2189          convert_alpha(gallivm, row_type, alpha_type,
   2190                        block_size, block_height,
   2191                        src_count, dst_channels,
   2192                        pad_inline, src1_alpha);
   2193       }
   2194    }
   2195 
   2196 
   2197    /*
   2198     * Load dst from memory
   2199     */
   2200    if (src_count < block_height) {
   2201       dst_count = block_height;
   2202    } else {
   2203       dst_count = src_count;
   2204    }
   2205 
   2206    dst_type.length *= block_size / dst_count;
   2207 
   2208    if (format_expands_to_float_soa(out_format_desc)) {
   2209       /*
   2210        * we need multiple values at once for the conversion, so we may as well
   2211        * load them vectorized here too instead of concatenating later.
   2212        * (Still need concatenation later for 8-wide vectors).
   2213        */
   2214       dst_count = block_height;
   2215       dst_type.length = block_width;
   2216    }
   2217 
   2218    /*
   2219     * Compute the alignment of the destination pointer in bytes.
   2220     * We fetch 1-4 pixels; if the format has pot alignment then those fetches
   2221     * are always aligned by MIN2(16, fetch_width), except for buffers (not
   2222     * 1d textures, but we can't distinguish that here), so we need to stick
   2223     * with per-pixel alignment in this case.
   2224     */
   2225    if (is_1d) {
   2226       dst_alignment = (out_format_desc->block.bits + 7)/(out_format_desc->block.width * 8);
   2227    }
   2228    else {
   2229       dst_alignment = dst_type.length * dst_type.width / 8;
   2230    }
   2231    /* Force power-of-two alignment by extracting only the least significant set bit */
   2232    dst_alignment = 1 << (ffs(dst_alignment) - 1);
   2233    /*
   2234     * Resource base and stride pointers are aligned to 16 bytes, so that's
   2235     * the maximum alignment we can guarantee
   2236     */
   2237    dst_alignment = MIN2(16, dst_alignment);
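           /*
            * E.g. in the 1d case for R8 this is just 1 byte per pixel, while
            * in the 2d case it is the row vector size in bytes, capped at 16.
            */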
   2238 
   2239    ls_type = dst_type;
   2240 
   2241    if (dst_count > src_count) {
   2242       if ((dst_type.width == 8 || dst_type.width == 16) &&
   2243           util_is_power_of_two(dst_type.length) &&
   2244           dst_type.length * dst_type.width < 128) {
   2245          /*
   2246           * Never try to load values as 4xi8 which we will then
   2247           * concatenate to larger vectors. This gives llvm a real
   2248           * headache (the problem is the type legalizer (?) will
   2249           * try to load that as 4xi8 zext to 4xi32 to fill the vector,
   2250           * then the shuffles to concatenate are more or less impossible
   2251           * - llvm is easily capable of generating a sequence of 32
   2252           * pextrb/pinsrb instructions for that. Albeit it appears to
   2253           * be fixed in llvm 4.0. So, load and concatenate with 32bit
   2254           * width to avoid the trouble (16bit seems not as bad, llvm
   2255           * probably recognizes the load+shuffle as only one shuffle
   2256           * is necessary, but we can do just the same anyway).
   2257           */
   2258          ls_type.length = dst_type.length * dst_type.width / 32;
   2259          ls_type.width = 32;
   2260       }
   2261    }
   2262 
   2263    if (is_1d) {
   2264       load_unswizzled_block(gallivm, color_ptr, stride, block_width, 1,
   2265                             dst, ls_type, dst_count / 4, dst_alignment);
   2266       for (i = dst_count / 4; i < dst_count; i++) {
   2267          dst[i] = lp_build_undef(gallivm, ls_type);
   2268       }
   2269 
   2270    }
   2271    else {
   2272       load_unswizzled_block(gallivm, color_ptr, stride, block_width, block_height,
   2273                             dst, ls_type, dst_count, dst_alignment);
   2274    }
   2275 
   2276 
   2277    /*
   2278     * Convert from dst/output format to src/blending format.
   2279     *
   2280     * This is necessary as we can only read 1 row from memory at a time,
   2281     * so the minimum dst_count we will ever have at this point is 4.
   2282     *
   2283     * With an R8 format, for example, all 16 pixels fit in a 128 bit vector,
   2284     * so this will take the 4 dsts and combine them into 1 src so we can
   2285     * perform blending on all 16 pixels in that single vector at once.
   2286     */
   2287    if (dst_count > src_count) {
   2288       if (ls_type.length != dst_type.length && ls_type.length == 1) {
   2289          LLVMTypeRef elem_type = lp_build_elem_type(gallivm, ls_type);
   2290          LLVMTypeRef ls_vec_type = LLVMVectorType(elem_type, 1);
   2291          for (i = 0; i < dst_count; i++) {
   2292             dst[i] = LLVMBuildBitCast(builder, dst[i], ls_vec_type, "");
   2293          }
   2294       }
   2295 
   2296       lp_build_concat_n(gallivm, ls_type, dst, 4, dst, src_count);
   2297 
   2298       if (ls_type.length != dst_type.length) {
   2299          struct lp_type tmp_type = dst_type;
   2300          tmp_type.length = dst_type.length * 4 / src_count;
   2301          for (i = 0; i < src_count; i++) {
   2302             dst[i] = LLVMBuildBitCast(builder, dst[i],
   2303                                       lp_build_vec_type(gallivm, tmp_type), "");
   2304          }
   2305       }
   2306    }
   2307 
   2308    /*
   2309     * Blending
   2310     */
   2311    /* XXX this is broken for RGB8 formats -
   2312     * they get expanded from 12 to 16 elements (to include alpha)
   2313     * by convert_to_blend_type then reduced to 15 instead of 12
   2314     * by convert_from_blend_type (a simple fix though breaks A8...).
   2315     * R16G16B16 also crashes, though differently; seemingly something goes
   2316     * wrong inside llvm when handling npot vector sizes.
   2317     * It seems some cleanup could be done here (like skipping conversion/blend
   2318     * when not needed).
   2319     */
   2320    convert_to_blend_type(gallivm, block_size, out_format_desc, dst_type,
   2321                          row_type, dst, src_count);
   2322 
   2323    /*
   2324     * FIXME: Really should get logic ops / masks out of generic blend / row
   2325     * format. Logic ops will definitely not work on the blend float format
   2326     * used for SRGB here and I think OpenGL expects this to work as expected
   2327     * (that is incoming values converted to srgb then logic op applied).
   2328     */
   2329    for (i = 0; i < src_count; ++i) {
   2330       dst[i] = lp_build_blend_aos(gallivm,
   2331                                   &variant->key.blend,
   2332                                   out_format,
   2333                                   row_type,
   2334                                   rt,
   2335                                   src[i],
   2336                                   has_alpha ? NULL : src_alpha[i],
   2337                                   src1[i],
   2338                                   has_alpha ? NULL : src1_alpha[i],
   2339                                   dst[i],
   2340                                   partial_mask ? src_mask[i] : NULL,
   2341                                   blend_color,
   2342                                   has_alpha ? NULL : blend_alpha,
   2343                                   swizzle,
   2344                                   pad_inline ? 4 : dst_channels);
   2345    }
   2346 
   2347    convert_from_blend_type(gallivm, block_size, out_format_desc,
   2348                            row_type, dst_type, dst, src_count);
   2349 
   2350    /* Split the blend rows back to memory rows */
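           /*
            * (E.g. the single 16 x i8 blend row of the R8 example above ends
            * up split back into four 4 x i8 memory rows.)
            */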
   2351    if (dst_count > src_count) {
   2352       row_type.length = dst_type.length * (dst_count / src_count);
   2353 
   2354       if (src_count == 1) {
   2355          dst[1] = lp_build_extract_range(gallivm, dst[0], row_type.length / 2, row_type.length / 2);
   2356          dst[0] = lp_build_extract_range(gallivm, dst[0], 0, row_type.length / 2);
   2357 
   2358          row_type.length /= 2;
   2359          src_count *= 2;
   2360       }
   2361 
   2362       dst[3] = lp_build_extract_range(gallivm, dst[1], row_type.length / 2, row_type.length / 2);
   2363       dst[2] = lp_build_extract_range(gallivm, dst[1], 0, row_type.length / 2);
   2364       dst[1] = lp_build_extract_range(gallivm, dst[0], row_type.length / 2, row_type.length / 2);
   2365       dst[0] = lp_build_extract_range(gallivm, dst[0], 0, row_type.length / 2);
   2366 
   2367       row_type.length /= 2;
   2368       src_count *= 2;
   2369    }
   2370 
   2371    /*
   2372     * Store blend result to memory
   2373     */
   2374    if (is_1d) {
   2375       store_unswizzled_block(gallivm, color_ptr, stride, block_width, 1,
   2376                              dst, dst_type, dst_count / 4, dst_alignment);
   2377    }
   2378    else {
   2379       store_unswizzled_block(gallivm, color_ptr, stride, block_width, block_height,
   2380                              dst, dst_type, dst_count, dst_alignment);
   2381    }
   2382 
   2383    if (have_smallfloat_format(dst_type, out_format)) {
   2384       lp_build_fpstate_set(gallivm, fpstate);
   2385    }
   2386 
   2387    if (do_branch) {
   2388       lp_build_mask_end(&mask_ctx);
   2389    }
   2390 }
   2391 
   2392 
   2393 /**
   2394  * Generate the runtime callable function for the whole fragment pipeline.
   2395  * Note that the function which we generate operates on a block of 16
   2396  * pixels at a time.  The block contains 2x2 quads.  Each quad contains
   2397  * 2x2 pixels.
   2398  */
   2399 static void
   2400 generate_fragment(struct llvmpipe_context *lp,
   2401                   struct lp_fragment_shader *shader,
   2402                   struct lp_fragment_shader_variant *variant,
   2403                   unsigned partial_mask)
   2404 {
   2405    struct gallivm_state *gallivm = variant->gallivm;
   2406    const struct lp_fragment_shader_variant_key *key = &variant->key;
   2407    struct lp_shader_input inputs[PIPE_MAX_SHADER_INPUTS];
   2408    char func_name[64];
   2409    struct lp_type fs_type;
   2410    struct lp_type blend_type;
   2411    LLVMTypeRef fs_elem_type;
   2412    LLVMTypeRef blend_vec_type;
   2413    LLVMTypeRef arg_types[13];
   2414    LLVMTypeRef func_type;
   2415    LLVMTypeRef int32_type = LLVMInt32TypeInContext(gallivm->context);
   2416    LLVMTypeRef int8_type = LLVMInt8TypeInContext(gallivm->context);
   2417    LLVMValueRef context_ptr;
   2418    LLVMValueRef x;
   2419    LLVMValueRef y;
   2420    LLVMValueRef a0_ptr;
   2421    LLVMValueRef dadx_ptr;
   2422    LLVMValueRef dady_ptr;
   2423    LLVMValueRef color_ptr_ptr;
   2424    LLVMValueRef stride_ptr;
   2425    LLVMValueRef depth_ptr;
   2426    LLVMValueRef depth_stride;
   2427    LLVMValueRef mask_input;
   2428    LLVMValueRef thread_data_ptr;
   2429    LLVMBasicBlockRef block;
   2430    LLVMBuilderRef builder;
   2431    struct lp_build_sampler_soa *sampler;
   2432    struct lp_build_interp_soa_context interp;
   2433    LLVMValueRef fs_mask[16 / 4];
   2434    LLVMValueRef fs_out_color[PIPE_MAX_COLOR_BUFS][TGSI_NUM_CHANNELS][16 / 4];
   2435    LLVMValueRef function;
   2436    LLVMValueRef facing;
   2437    unsigned num_fs;
   2438    unsigned i;
   2439    unsigned chan;
   2440    unsigned cbuf;
   2441    boolean cbuf0_write_all;
   2442    const boolean dual_source_blend = key->blend.rt[0].blend_enable &&
   2443                                      util_blend_state_is_dual(&key->blend, 0);
   2444 
   2445    assert(lp_native_vector_width / 32 >= 4);
   2446 
   2447    /* Adjust color input interpolation according to flatshade state:
   2448     */
   2449    memcpy(inputs, shader->inputs, shader->info.base.num_inputs * sizeof inputs[0]);
   2450    for (i = 0; i < shader->info.base.num_inputs; i++) {
   2451       if (inputs[i].interp == LP_INTERP_COLOR) {
   2452          if (key->flatshade)
   2453             inputs[i].interp = LP_INTERP_CONSTANT;
   2454          else
   2455             inputs[i].interp = LP_INTERP_PERSPECTIVE;
   2456       }
   2457    }
   2458 
   2459    /* check if writes to cbuf[0] are to be copied to all cbufs */
   2460    cbuf0_write_all =
   2461      shader->info.base.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS];
   2462 
   2463    /* TODO: actually pick these based on the fs and color buffer
   2464     * characteristics. */
   2465 
   2466    memset(&fs_type, 0, sizeof fs_type);
   2467    fs_type.floating = TRUE;      /* floating point values */
   2468    fs_type.sign = TRUE;          /* values are signed */
   2469    fs_type.norm = FALSE;         /* values are not limited to [0,1] or [-1,1] */
   2470    fs_type.width = 32;           /* 32-bit float */
   2471    fs_type.length = MIN2(lp_native_vector_width / 32, 16); /* n*4 elements per vector */
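           /* e.g. 4 floats per vector with 128-bit SSE, 8 with 256-bit AVX */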
   2472 
   2473    memset(&blend_type, 0, sizeof blend_type);
   2474    blend_type.floating = FALSE; /* values are integers */
   2475    blend_type.sign = FALSE;     /* values are unsigned */
   2476    blend_type.norm = TRUE;      /* values are in [0,1] or [-1,1] */
   2477    blend_type.width = 8;        /* 8-bit ubyte values */
   2478    blend_type.length = 16;      /* 16 elements per vector */
   2479 
   2480    /*
   2481     * Generate the function prototype. Any change here must be reflected in
   2482     * lp_jit.h's lp_jit_frag_func function pointer type, and vice-versa.
   2483     */
   2484 
   2485    fs_elem_type = lp_build_elem_type(gallivm, fs_type);
   2486 
   2487    blend_vec_type = lp_build_vec_type(gallivm, blend_type);
   2488 
   2489    util_snprintf(func_name, sizeof(func_name), "fs%u_variant%u_%s",
   2490                  shader->no, variant->no, partial_mask ? "partial" : "whole");
   2491 
   2492    arg_types[0] = variant->jit_context_ptr_type;       /* context */
   2493    arg_types[1] = int32_type;                          /* x */
   2494    arg_types[2] = int32_type;                          /* y */
   2495    arg_types[3] = int32_type;                          /* facing */
   2496    arg_types[4] = LLVMPointerType(fs_elem_type, 0);    /* a0 */
   2497    arg_types[5] = LLVMPointerType(fs_elem_type, 0);    /* dadx */
   2498    arg_types[6] = LLVMPointerType(fs_elem_type, 0);    /* dady */
   2499    arg_types[7] = LLVMPointerType(LLVMPointerType(blend_vec_type, 0), 0);  /* color */
   2500    arg_types[8] = LLVMPointerType(int8_type, 0);       /* depth */
   2501    arg_types[9] = int32_type;                          /* mask_input */
   2502    arg_types[10] = variant->jit_thread_data_ptr_type;  /* per thread data */
   2503    arg_types[11] = LLVMPointerType(int32_type, 0);     /* stride */
   2504    arg_types[12] = int32_type;                         /* depth_stride */
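           /*
            * For reference, a rough C-side view of this prototype (the
            * authoritative definition is lp_jit_frag_func in lp_jit.h):
            *
            *    void fs(struct lp_jit_context *context,
            *            uint32_t x, uint32_t y, uint32_t facing,
            *            const void *a0, const void *dadx, const void *dady,
            *            uint8_t **color, uint8_t *depth, uint32_t mask_input,
            *            struct lp_jit_thread_data *thread_data,
            *            uint32_t *stride, uint32_t depth_stride);
            */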
   2505 
   2506    func_type = LLVMFunctionType(LLVMVoidTypeInContext(gallivm->context),
   2507                                 arg_types, ARRAY_SIZE(arg_types), 0);
   2508 
   2509    function = LLVMAddFunction(gallivm->module, func_name, func_type);
   2510    LLVMSetFunctionCallConv(function, LLVMCCallConv);
   2511 
   2512    variant->function[partial_mask] = function;
   2513 
   2514    /* XXX: need to propagate noalias down into color param now we are
   2515     * passing a pointer-to-pointer?
   2516     */
   2517    for (i = 0; i < ARRAY_SIZE(arg_types); ++i)
   2518       if (LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind)
   2519          lp_add_function_attr(function, i + 1, LP_FUNC_ATTR_NOALIAS);
   2520 
   2521    context_ptr  = LLVMGetParam(function, 0);
   2522    x            = LLVMGetParam(function, 1);
   2523    y            = LLVMGetParam(function, 2);
   2524    facing       = LLVMGetParam(function, 3);
   2525    a0_ptr       = LLVMGetParam(function, 4);
   2526    dadx_ptr     = LLVMGetParam(function, 5);
   2527    dady_ptr     = LLVMGetParam(function, 6);
   2528    color_ptr_ptr = LLVMGetParam(function, 7);
   2529    depth_ptr    = LLVMGetParam(function, 8);
   2530    mask_input   = LLVMGetParam(function, 9);
   2531    thread_data_ptr  = LLVMGetParam(function, 10);
   2532    stride_ptr   = LLVMGetParam(function, 11);
   2533    depth_stride = LLVMGetParam(function, 12);
   2534 
   2535    lp_build_name(context_ptr, "context");
   2536    lp_build_name(x, "x");
   2537    lp_build_name(y, "y");
   2538    lp_build_name(a0_ptr, "a0");
   2539    lp_build_name(dadx_ptr, "dadx");
   2540    lp_build_name(dady_ptr, "dady");
   2541    lp_build_name(color_ptr_ptr, "color_ptr_ptr");
   2542    lp_build_name(depth_ptr, "depth");
   2543    lp_build_name(mask_input, "mask_input");
   2544    lp_build_name(thread_data_ptr, "thread_data");
   2545    lp_build_name(stride_ptr, "stride_ptr");
   2546    lp_build_name(depth_stride, "depth_stride");
   2547 
   2548    /*
   2549     * Function body
   2550     */
   2551 
   2552    block = LLVMAppendBasicBlockInContext(gallivm->context, function, "entry");
   2553    builder = gallivm->builder;
   2554    assert(builder);
   2555    LLVMPositionBuilderAtEnd(builder, block);
   2556 
   2557    /* code generated texture sampling */
   2558    sampler = lp_llvm_sampler_soa_create(key->state);
   2559 
   2560    num_fs = 16 / fs_type.length; /* number of loops per 4x4 stamp */
   2561    /* for 1d resources only run "upper half" of stamp */
   2562    if (key->resource_1d)
   2563       num_fs /= 2;
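           /* e.g. 4 loops with 4-wide vectors, 2 loops with 8-wide (AVX) */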
   2564 
   2565    {
   2566       LLVMValueRef num_loop = lp_build_const_int32(gallivm, num_fs);
   2567       LLVMTypeRef mask_type = lp_build_int_vec_type(gallivm, fs_type);
   2568       LLVMValueRef mask_store = lp_build_array_alloca(gallivm, mask_type,
   2569                                                       num_loop, "mask_store");
   2570       LLVMValueRef color_store[PIPE_MAX_COLOR_BUFS][TGSI_NUM_CHANNELS];
   2571       boolean pixel_center_integer =
   2572          shader->info.base.properties[TGSI_PROPERTY_FS_COORD_PIXEL_CENTER];
   2573 
   2574       /*
   2575        * The shader input interpolation info is not explicitly baked into the
   2576        * shader key, but everything it derives from (TGSI, and flatshade) is
   2577        * already included in the shader key.
   2578        */
   2579       lp_build_interp_soa_init(&interp,
   2580                                gallivm,
   2581                                shader->info.base.num_inputs,
   2582                                inputs,
   2583                                pixel_center_integer,
   2584                                key->depth_clamp,
   2585                                builder, fs_type,
   2586                                a0_ptr, dadx_ptr, dady_ptr,
   2587                                x, y);
   2588 
   2589       for (i = 0; i < num_fs; i++) {
   2590          LLVMValueRef mask;
   2591          LLVMValueRef indexi = lp_build_const_int32(gallivm, i);
   2592          LLVMValueRef mask_ptr = LLVMBuildGEP(builder, mask_store,
   2593                                               &indexi, 1, "mask_ptr");
   2594 
   2595          if (partial_mask) {
   2596             mask = generate_quad_mask(gallivm, fs_type,
   2597                                       i*fs_type.length/4, mask_input);
   2598          }
   2599          else {
   2600             mask = lp_build_const_int_vec(gallivm, fs_type, ~0);
   2601          }
   2602          LLVMBuildStore(builder, mask, mask_ptr);
   2603       }
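               /*
                * Note: each iteration above covers fs_type.length pixels,
                * i.e. fs_type.length/4 quads, which is why the first-quad
                * index passed to generate_quad_mask() advances by that
                * amount.
                */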
   2604 
   2605       generate_fs_loop(gallivm,
   2606                        shader, key,
   2607                        builder,
   2608                        fs_type,
   2609                        context_ptr,
   2610                        num_loop,
   2611                        &interp,
   2612                        sampler,
   2613                        mask_store, /* output */
   2614                        color_store,
   2615                        depth_ptr,
   2616                        depth_stride,
   2617                        facing,
   2618                        thread_data_ptr);
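               /*
                * generate_fs_loop() emits an LLVM-level loop over the
                * num_loop mask/color slots; the unrolled loads below pull
                * the results back out of the alloca arrays for the blend
                * stage.
                */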
   2619 
   2620       for (i = 0; i < num_fs; i++) {
   2621          LLVMValueRef indexi = lp_build_const_int32(gallivm, i);
   2622          LLVMValueRef ptr = LLVMBuildGEP(builder, mask_store,
   2623                                          &indexi, 1, "");
   2624          fs_mask[i] = LLVMBuildLoad(builder, ptr, "mask");
    2625       /* XXX: this indexing is convoluted; it needs reorganizing */
   2626          for (cbuf = 0; cbuf < key->nr_cbufs; cbuf++) {
   2627             for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
   2628                ptr = LLVMBuildGEP(builder,
   2629                                   color_store[cbuf * !cbuf0_write_all][chan],
   2630                                   &indexi, 1, "");
   2631                fs_out_color[cbuf][chan][i] = ptr;
   2632             }
   2633          }
   2634          if (dual_source_blend) {
    2635          /* we only support one dual-source blend target, hence always use output 1 */
   2636             for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
   2637                ptr = LLVMBuildGEP(builder,
   2638                                   color_store[1][chan],
   2639                                   &indexi, 1, "");
   2640                fs_out_color[1][chan][i] = ptr;
   2641             }
   2642          }
   2643       }
   2644    }
   2645 
   2646    sampler->destroy(sampler);
   2647 
   2648    /* Loop over color outputs / color buffers to do blending.
   2649     */
   2650    for(cbuf = 0; cbuf < key->nr_cbufs; cbuf++) {
   2651       if (key->cbuf_format[cbuf] != PIPE_FORMAT_NONE) {
   2652          LLVMValueRef color_ptr;
   2653          LLVMValueRef stride;
   2654          LLVMValueRef index = lp_build_const_int32(gallivm, cbuf);
   2655 
   2656          boolean do_branch = ((key->depth.enabled
   2657                                || key->stencil[0].enabled
   2658                                || key->alpha.enabled)
   2659                               && !shader->info.base.uses_kill);
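                  /*
                   * do_branch presumably lets generate_unswizzled_blend()
                   * branch past the framebuffer read-modify-write when the
                   * coverage mask for a stamp has been zeroed by the
                   * depth/stencil/alpha tests.
                   */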
   2660 
   2661          color_ptr = LLVMBuildLoad(builder,
   2662                                    LLVMBuildGEP(builder, color_ptr_ptr,
   2663                                                 &index, 1, ""),
   2664                                    "");
   2665 
   2666          lp_build_name(color_ptr, "color_ptr%d", cbuf);
   2667 
   2668          stride = LLVMBuildLoad(builder,
   2669                                 LLVMBuildGEP(builder, stride_ptr, &index, 1, ""),
   2670                                 "");
   2671 
   2672          generate_unswizzled_blend(gallivm, cbuf, variant,
   2673                                    key->cbuf_format[cbuf],
   2674                                    num_fs, fs_type, fs_mask, fs_out_color,
   2675                                    context_ptr, color_ptr, stride,
   2676                                    partial_mask, do_branch);
   2677       }
   2678    }
   2679 
   2680    LLVMBuildRetVoid(builder);
   2681 
   2682    gallivm_verify_function(gallivm, function);
   2683 }
   2684 
   2685 
   2686 static void
   2687 dump_fs_variant_key(const struct lp_fragment_shader_variant_key *key)
   2688 {
   2689    unsigned i;
   2690 
   2691    debug_printf("fs variant %p:\n", (void *) key);
   2692 
   2693    if (key->flatshade) {
   2694       debug_printf("flatshade = 1\n");
   2695    }
   2696    for (i = 0; i < key->nr_cbufs; ++i) {
   2697       debug_printf("cbuf_format[%u] = %s\n", i, util_format_name(key->cbuf_format[i]));
   2698    }
   2699    if (key->depth.enabled || key->stencil[0].enabled) {
   2700       debug_printf("depth.format = %s\n", util_format_name(key->zsbuf_format));
   2701    }
   2702    if (key->depth.enabled) {
   2703       debug_printf("depth.func = %s\n", util_str_func(key->depth.func, TRUE));
   2704       debug_printf("depth.writemask = %u\n", key->depth.writemask);
   2705    }
   2706 
   2707    for (i = 0; i < 2; ++i) {
   2708       if (key->stencil[i].enabled) {
   2709          debug_printf("stencil[%u].func = %s\n", i, util_str_func(key->stencil[i].func, TRUE));
   2710          debug_printf("stencil[%u].fail_op = %s\n", i, util_str_stencil_op(key->stencil[i].fail_op, TRUE));
   2711          debug_printf("stencil[%u].zpass_op = %s\n", i, util_str_stencil_op(key->stencil[i].zpass_op, TRUE));
   2712          debug_printf("stencil[%u].zfail_op = %s\n", i, util_str_stencil_op(key->stencil[i].zfail_op, TRUE));
   2713          debug_printf("stencil[%u].valuemask = 0x%x\n", i, key->stencil[i].valuemask);
   2714          debug_printf("stencil[%u].writemask = 0x%x\n", i, key->stencil[i].writemask);
   2715       }
   2716    }
   2717 
   2718    if (key->alpha.enabled) {
   2719       debug_printf("alpha.func = %s\n", util_str_func(key->alpha.func, TRUE));
   2720    }
   2721 
   2722    if (key->occlusion_count) {
   2723       debug_printf("occlusion_count = 1\n");
   2724    }
   2725 
   2726    if (key->blend.logicop_enable) {
   2727       debug_printf("blend.logicop_func = %s\n", util_str_logicop(key->blend.logicop_func, TRUE));
   2728    }
   2729    else if (key->blend.rt[0].blend_enable) {
   2730       debug_printf("blend.rgb_func = %s\n",   util_str_blend_func  (key->blend.rt[0].rgb_func, TRUE));
   2731       debug_printf("blend.rgb_src_factor = %s\n",   util_str_blend_factor(key->blend.rt[0].rgb_src_factor, TRUE));
   2732       debug_printf("blend.rgb_dst_factor = %s\n",   util_str_blend_factor(key->blend.rt[0].rgb_dst_factor, TRUE));
   2733       debug_printf("blend.alpha_func = %s\n",       util_str_blend_func  (key->blend.rt[0].alpha_func, TRUE));
   2734       debug_printf("blend.alpha_src_factor = %s\n", util_str_blend_factor(key->blend.rt[0].alpha_src_factor, TRUE));
   2735       debug_printf("blend.alpha_dst_factor = %s\n", util_str_blend_factor(key->blend.rt[0].alpha_dst_factor, TRUE));
   2736    }
   2737    debug_printf("blend.colormask = 0x%x\n", key->blend.rt[0].colormask);
   2738    if (key->blend.alpha_to_coverage) {
   2739       debug_printf("blend.alpha_to_coverage is enabled\n");
   2740    }
   2741    for (i = 0; i < key->nr_samplers; ++i) {
   2742       const struct lp_static_sampler_state *sampler = &key->state[i].sampler_state;
   2743       debug_printf("sampler[%u] = \n", i);
   2744       debug_printf("  .wrap = %s %s %s\n",
   2745                    util_str_tex_wrap(sampler->wrap_s, TRUE),
   2746                    util_str_tex_wrap(sampler->wrap_t, TRUE),
   2747                    util_str_tex_wrap(sampler->wrap_r, TRUE));
   2748       debug_printf("  .min_img_filter = %s\n",
   2749                    util_str_tex_filter(sampler->min_img_filter, TRUE));
   2750       debug_printf("  .min_mip_filter = %s\n",
   2751                    util_str_tex_mipfilter(sampler->min_mip_filter, TRUE));
   2752       debug_printf("  .mag_img_filter = %s\n",
   2753                    util_str_tex_filter(sampler->mag_img_filter, TRUE));
   2754       if (sampler->compare_mode != PIPE_TEX_COMPARE_NONE)
   2755          debug_printf("  .compare_func = %s\n", util_str_func(sampler->compare_func, TRUE));
   2756       debug_printf("  .normalized_coords = %u\n", sampler->normalized_coords);
   2757       debug_printf("  .min_max_lod_equal = %u\n", sampler->min_max_lod_equal);
   2758       debug_printf("  .lod_bias_non_zero = %u\n", sampler->lod_bias_non_zero);
   2759       debug_printf("  .apply_min_lod = %u\n", sampler->apply_min_lod);
   2760       debug_printf("  .apply_max_lod = %u\n", sampler->apply_max_lod);
   2761    }
   2762    for (i = 0; i < key->nr_sampler_views; ++i) {
   2763       const struct lp_static_texture_state *texture = &key->state[i].texture_state;
   2764       debug_printf("texture[%u] = \n", i);
   2765       debug_printf("  .format = %s\n",
   2766                    util_format_name(texture->format));
   2767       debug_printf("  .target = %s\n",
   2768                    util_str_tex_target(texture->target, TRUE));
   2769       debug_printf("  .level_zero_only = %u\n",
   2770                    texture->level_zero_only);
   2771       debug_printf("  .pot = %u %u %u\n",
   2772                    texture->pot_width,
   2773                    texture->pot_height,
   2774                    texture->pot_depth);
   2775    }
   2776 }
   2777 
   2778 
   2779 void
   2780 lp_debug_fs_variant(const struct lp_fragment_shader_variant *variant)
   2781 {
   2782    debug_printf("llvmpipe: Fragment shader #%u variant #%u:\n",
   2783                 variant->shader->no, variant->no);
   2784    tgsi_dump(variant->shader->base.tokens, 0);
   2785    dump_fs_variant_key(&variant->key);
   2786    debug_printf("variant->opaque = %u\n", variant->opaque);
   2787    debug_printf("\n");
   2788 }
   2789 
   2790 
   2791 /**
   2792  * Generate a new fragment shader variant from the shader code and
   2793  * other state indicated by the key.
   2794  */
   2795 static struct lp_fragment_shader_variant *
   2796 generate_variant(struct llvmpipe_context *lp,
   2797                  struct lp_fragment_shader *shader,
   2798                  const struct lp_fragment_shader_variant_key *key)
   2799 {
   2800    struct lp_fragment_shader_variant *variant;
   2801    const struct util_format_description *cbuf0_format_desc = NULL;
   2802    boolean fullcolormask;
   2803    char module_name[64];
   2804 
   2805    variant = CALLOC_STRUCT(lp_fragment_shader_variant);
   2806    if (!variant)
   2807       return NULL;
   2808 
   2809    util_snprintf(module_name, sizeof(module_name), "fs%u_variant%u",
   2810                  shader->no, shader->variants_created);
   2811 
   2812    variant->gallivm = gallivm_create(module_name, lp->context);
   2813    if (!variant->gallivm) {
   2814       FREE(variant);
   2815       return NULL;
   2816    }
   2817 
   2818    variant->shader = shader;
   2819    variant->list_item_global.base = variant;
   2820    variant->list_item_local.base = variant;
   2821    variant->no = shader->variants_created++;
   2822 
   2823    memcpy(&variant->key, key, shader->variant_key_size);
   2824 
   2825    /*
   2826     * Determine whether we are touching all channels in the color buffer.
   2827     */
   2828    fullcolormask = FALSE;
   2829    if (key->nr_cbufs == 1) {
   2830       cbuf0_format_desc = util_format_description(key->cbuf_format[0]);
   2831       fullcolormask = util_format_colormask_full(cbuf0_format_desc, key->blend.rt[0].colormask);
   2832    }
   2833 
   2834    variant->opaque =
   2835          !key->blend.logicop_enable &&
   2836          !key->blend.rt[0].blend_enable &&
   2837          fullcolormask &&
   2838          !key->stencil[0].enabled &&
   2839          !key->alpha.enabled &&
   2840          !key->blend.alpha_to_coverage &&
   2841          !key->depth.enabled &&
   2842          !shader->info.base.uses_kill &&
   2843          !shader->info.base.writes_samplemask
   2844       ? TRUE : FALSE;
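            /*
             * An opaque variant overwrites every channel of every covered
             * pixel and can never read or discard, so the rasterizer can
             * skip fetching the existing color buffer contents entirely;
             * this is what makes the specialized RAST_WHOLE shader below
             * worthwhile.
             */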
   2845 
   2846    if ((shader->info.base.num_tokens <= 1) &&
   2847        !key->depth.enabled && !key->stencil[0].enabled) {
   2848       variant->ps_inv_multiplier = 0;
   2849    } else {
   2850       variant->ps_inv_multiplier = 1;
   2851    }
   2852 
   2853    if ((LP_DEBUG & DEBUG_FS) || (gallivm_debug & GALLIVM_DEBUG_IR)) {
   2854       lp_debug_fs_variant(variant);
   2855    }
   2856 
   2857    lp_jit_init_types(variant);
   2858 
   2859    if (variant->jit_function[RAST_EDGE_TEST] == NULL)
   2860       generate_fragment(lp, shader, variant, RAST_EDGE_TEST);
   2861 
   2862    if (variant->jit_function[RAST_WHOLE] == NULL) {
   2863       if (variant->opaque) {
   2864          /* Specialized shader, which doesn't need to read the color buffer. */
   2865          generate_fragment(lp, shader, variant, RAST_WHOLE);
   2866       }
   2867    }
   2868 
   2869    /*
   2870     * Compile everything
   2871     */
   2872 
   2873    gallivm_compile_module(variant->gallivm);
   2874 
   2875    variant->nr_instrs += lp_build_count_ir_module(variant->gallivm->module);
   2876 
   2877    if (variant->function[RAST_EDGE_TEST]) {
   2878       variant->jit_function[RAST_EDGE_TEST] = (lp_jit_frag_func)
   2879             gallivm_jit_function(variant->gallivm,
   2880                                  variant->function[RAST_EDGE_TEST]);
   2881    }
   2882 
   2883    if (variant->function[RAST_WHOLE]) {
    2884       variant->jit_function[RAST_WHOLE] = (lp_jit_frag_func)
    2885             gallivm_jit_function(variant->gallivm,
    2886                                  variant->function[RAST_WHOLE]);
   2887    } else if (!variant->jit_function[RAST_WHOLE]) {
   2888       variant->jit_function[RAST_WHOLE] = variant->jit_function[RAST_EDGE_TEST];
   2889    }
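            /*
             * Note: when no specialized whole-stamp shader was generated,
             * whole stamps reuse the edge-test variant above, which handles
             * fully-covered stamps correctly, just less efficiently.
             */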
   2890 
   2891    gallivm_free_ir(variant->gallivm);
   2892 
   2893    return variant;
   2894 }
   2895 
   2896 
   2897 static void *
   2898 llvmpipe_create_fs_state(struct pipe_context *pipe,
   2899                          const struct pipe_shader_state *templ)
   2900 {
   2901    struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
   2902    struct lp_fragment_shader *shader;
   2903    int nr_samplers;
   2904    int nr_sampler_views;
   2905    int i;
   2906 
   2907    shader = CALLOC_STRUCT(lp_fragment_shader);
   2908    if (!shader)
   2909       return NULL;
   2910 
   2911    shader->no = fs_no++;
   2912    make_empty_list(&shader->variants);
   2913 
   2914    /* get/save the summary info for this shader */
   2915    lp_build_tgsi_info(templ->tokens, &shader->info);
   2916 
   2917    /* we need to keep a local copy of the tokens */
   2918    shader->base.tokens = tgsi_dup_tokens(templ->tokens);
   2919 
   2920    shader->draw_data = draw_create_fragment_shader(llvmpipe->draw, templ);
   2921    if (shader->draw_data == NULL) {
   2922       FREE((void *) shader->base.tokens);
   2923       FREE(shader);
   2924       return NULL;
   2925    }
   2926 
   2927    nr_samplers = shader->info.base.file_max[TGSI_FILE_SAMPLER] + 1;
   2928    nr_sampler_views = shader->info.base.file_max[TGSI_FILE_SAMPLER_VIEW] + 1;
   2929 
   2930    shader->variant_key_size = Offset(struct lp_fragment_shader_variant_key,
   2931                                      state[MAX2(nr_samplers, nr_sampler_views)]);
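            /*
             * Computing the key size with Offset() means only the sampler /
             * sampler-view slots this shader can actually reference are part
             * of the key; make_variant_key() and the variant lookup both use
             * this truncated size for their memset/memcmp.
             */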
   2932 
   2933    for (i = 0; i < shader->info.base.num_inputs; i++) {
   2934       shader->inputs[i].usage_mask = shader->info.base.input_usage_mask[i];
   2935       shader->inputs[i].cyl_wrap = shader->info.base.input_cylindrical_wrap[i];
   2936 
   2937       switch (shader->info.base.input_interpolate[i]) {
   2938       case TGSI_INTERPOLATE_CONSTANT:
   2939          shader->inputs[i].interp = LP_INTERP_CONSTANT;
   2940          break;
   2941       case TGSI_INTERPOLATE_LINEAR:
   2942          shader->inputs[i].interp = LP_INTERP_LINEAR;
   2943          break;
   2944       case TGSI_INTERPOLATE_PERSPECTIVE:
   2945          shader->inputs[i].interp = LP_INTERP_PERSPECTIVE;
   2946          break;
   2947       case TGSI_INTERPOLATE_COLOR:
   2948          shader->inputs[i].interp = LP_INTERP_COLOR;
   2949          break;
   2950       default:
   2951          assert(0);
   2952          break;
   2953       }
   2954 
   2955       switch (shader->info.base.input_semantic_name[i]) {
   2956       case TGSI_SEMANTIC_FACE:
   2957          shader->inputs[i].interp = LP_INTERP_FACING;
   2958          break;
   2959       case TGSI_SEMANTIC_POSITION:
   2960          /* Position was already emitted above
   2961           */
   2962          shader->inputs[i].interp = LP_INTERP_POSITION;
   2963          shader->inputs[i].src_index = 0;
   2964          continue;
   2965       }
   2966 
   2967       /* XXX this is a completely pointless index map... */
   2968       shader->inputs[i].src_index = i+1;
   2969    }
   2970 
   2971    if (LP_DEBUG & DEBUG_TGSI) {
   2972       unsigned attrib;
   2973       debug_printf("llvmpipe: Create fragment shader #%u %p:\n",
   2974                    shader->no, (void *) shader);
   2975       tgsi_dump(templ->tokens, 0);
   2976       debug_printf("usage masks:\n");
   2977       for (attrib = 0; attrib < shader->info.base.num_inputs; ++attrib) {
   2978          unsigned usage_mask = shader->info.base.input_usage_mask[attrib];
   2979          debug_printf("  IN[%u].%s%s%s%s\n",
   2980                       attrib,
   2981                       usage_mask & TGSI_WRITEMASK_X ? "x" : "",
   2982                       usage_mask & TGSI_WRITEMASK_Y ? "y" : "",
   2983                       usage_mask & TGSI_WRITEMASK_Z ? "z" : "",
   2984                       usage_mask & TGSI_WRITEMASK_W ? "w" : "");
   2985       }
   2986       debug_printf("\n");
   2987    }
   2988 
   2989    return shader;
   2990 }
   2991 
   2992 
   2993 static void
   2994 llvmpipe_bind_fs_state(struct pipe_context *pipe, void *fs)
   2995 {
   2996    struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
   2997 
   2998    if (llvmpipe->fs == fs)
   2999       return;
   3000 
   3001    llvmpipe->fs = (struct lp_fragment_shader *) fs;
   3002 
   3003    draw_bind_fragment_shader(llvmpipe->draw,
   3004                              (llvmpipe->fs ? llvmpipe->fs->draw_data : NULL));
   3005 
   3006    llvmpipe->dirty |= LP_NEW_FS;
   3007 }
   3008 
   3009 
   3010 /**
   3011  * Remove shader variant from two lists: the shader's variant list
   3012  * and the context's variant list.
   3013  */
   3014 void
   3015 llvmpipe_remove_shader_variant(struct llvmpipe_context *lp,
   3016                                struct lp_fragment_shader_variant *variant)
   3017 {
   3018    if ((LP_DEBUG & DEBUG_FS) || (gallivm_debug & GALLIVM_DEBUG_IR)) {
   3019       debug_printf("llvmpipe: del fs #%u var %u v created %u v cached %u "
   3020                    "v total cached %u inst %u total inst %u\n",
   3021                    variant->shader->no, variant->no,
   3022                    variant->shader->variants_created,
   3023                    variant->shader->variants_cached,
   3024                    lp->nr_fs_variants, variant->nr_instrs, lp->nr_fs_instrs);
   3025    }
   3026 
   3027    gallivm_destroy(variant->gallivm);
   3028 
   3029    /* remove from shader's list */
   3030    remove_from_list(&variant->list_item_local);
   3031    variant->shader->variants_cached--;
   3032 
   3033    /* remove from context's list */
   3034    remove_from_list(&variant->list_item_global);
   3035    lp->nr_fs_variants--;
   3036    lp->nr_fs_instrs -= variant->nr_instrs;
   3037 
   3038    FREE(variant);
   3039 }
   3040 
   3041 
   3042 static void
   3043 llvmpipe_delete_fs_state(struct pipe_context *pipe, void *fs)
   3044 {
   3045    struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
   3046    struct lp_fragment_shader *shader = fs;
   3047    struct lp_fs_variant_list_item *li;
   3048 
   3049    assert(fs != llvmpipe->fs);
   3050 
   3051    /*
    3052     * XXX: until we have some sort of reference counting in fragment
    3053     * shaders we need to flush the context, as variants may still be
    3054     * binned.  Flushing alone might not be sufficient; we must wait too.
   3055     */
   3056    llvmpipe_finish(pipe, __FUNCTION__);
   3057 
   3058    /* Delete all the variants */
   3059    li = first_elem(&shader->variants);
   3060    while(!at_end(&shader->variants, li)) {
   3061       struct lp_fs_variant_list_item *next = next_elem(li);
   3062       llvmpipe_remove_shader_variant(llvmpipe, li->base);
   3063       li = next;
   3064    }
   3065 
   3066    /* Delete draw module's data */
   3067    draw_delete_fragment_shader(llvmpipe->draw, shader->draw_data);
   3068 
   3069    assert(shader->variants_cached == 0);
   3070    FREE((void *) shader->base.tokens);
   3071    FREE(shader);
   3072 }
   3073 
   3074 
   3075 
   3076 static void
   3077 llvmpipe_set_constant_buffer(struct pipe_context *pipe,
   3078                              enum pipe_shader_type shader, uint index,
   3079                              const struct pipe_constant_buffer *cb)
   3080 {
   3081    struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
   3082    struct pipe_resource *constants = cb ? cb->buffer : NULL;
   3083 
   3084    assert(shader < PIPE_SHADER_TYPES);
   3085    assert(index < ARRAY_SIZE(llvmpipe->constants[shader]));
   3086 
   3087    /* note: reference counting */
   3088    util_copy_constant_buffer(&llvmpipe->constants[shader][index], cb);
   3089 
   3090    if (constants) {
    3091       if (!(constants->bind & PIPE_BIND_CONSTANT_BUFFER)) {
   3092          debug_printf("Illegal set constant without bind flag\n");
   3093          constants->bind |= PIPE_BIND_CONSTANT_BUFFER;
   3094       }
   3095    }
   3096 
   3097    if (shader == PIPE_SHADER_VERTEX ||
   3098        shader == PIPE_SHADER_GEOMETRY) {
   3099       /* Pass the constants to the 'draw' module */
   3100       const unsigned size = cb ? cb->buffer_size : 0;
   3101       const ubyte *data;
   3102 
   3103       if (constants) {
   3104          data = (ubyte *) llvmpipe_resource_data(constants);
   3105       }
   3106       else if (cb && cb->user_buffer) {
   3107          data = (ubyte *) cb->user_buffer;
   3108       }
   3109       else {
   3110          data = NULL;
   3111       }
   3112 
   3113       if (data)
   3114          data += cb->buffer_offset;
   3115 
   3116       draw_set_mapped_constant_buffer(llvmpipe->draw, shader,
   3117                                       index, data, size);
   3118    }
   3119    else {
   3120       llvmpipe->dirty |= LP_NEW_FS_CONSTANTS;
   3121    }
   3122 
   3123    if (cb && cb->user_buffer) {
   3124       pipe_resource_reference(&constants, NULL);
   3125    }
   3126 }
   3127 
   3128 
   3129 /**
   3130  * Return the blend factor equivalent to a destination alpha of one.
   3131  */
   3132 static inline unsigned
   3133 force_dst_alpha_one(unsigned factor, boolean clamped_zero)
   3134 {
   3135    switch(factor) {
   3136    case PIPE_BLENDFACTOR_DST_ALPHA:
   3137       return PIPE_BLENDFACTOR_ONE;
   3138    case PIPE_BLENDFACTOR_INV_DST_ALPHA:
   3139       return PIPE_BLENDFACTOR_ZERO;
   3140    case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
   3141       if (clamped_zero)
   3142          return PIPE_BLENDFACTOR_ZERO;
   3143       else
   3144          return PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE;
   3145    }
   3146 
   3147    return factor;
   3148 }
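
         /*
          * E.g. blending with factors (SRC_ALPHA, INV_DST_ALPHA) against a
          * destination format without stored alpha becomes (SRC_ALPHA, ZERO),
          * since the destination alpha reads back as 1.0.
          */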
   3149 
   3150 
   3151 /**
   3152  * We need to generate several variants of the fragment pipeline to match
   3153  * all the combinations of the contributing state atoms.
   3154  *
   3155  * TODO: there is actually no reason to tie this to context state -- the
   3156  * generated code could be cached globally in the screen.
   3157  */
   3158 static void
   3159 make_variant_key(struct llvmpipe_context *lp,
   3160                  struct lp_fragment_shader *shader,
   3161                  struct lp_fragment_shader_variant_key *key)
   3162 {
   3163    unsigned i;
   3164 
   3165    memset(key, 0, shader->variant_key_size);
   3166 
   3167    if (lp->framebuffer.zsbuf) {
   3168       enum pipe_format zsbuf_format = lp->framebuffer.zsbuf->format;
   3169       const struct util_format_description *zsbuf_desc =
   3170          util_format_description(zsbuf_format);
   3171 
   3172       if (lp->depth_stencil->depth.enabled &&
   3173           util_format_has_depth(zsbuf_desc)) {
   3174          key->zsbuf_format = zsbuf_format;
   3175          memcpy(&key->depth, &lp->depth_stencil->depth, sizeof key->depth);
   3176       }
   3177       if (lp->depth_stencil->stencil[0].enabled &&
   3178           util_format_has_stencil(zsbuf_desc)) {
   3179          key->zsbuf_format = zsbuf_format;
   3180          memcpy(&key->stencil, &lp->depth_stencil->stencil, sizeof key->stencil);
   3181       }
   3182       if (llvmpipe_resource_is_1d(lp->framebuffer.zsbuf->texture)) {
   3183          key->resource_1d = TRUE;
   3184       }
   3185    }
   3186 
   3187    /*
   3188     * Propagate the depth clamp setting from the rasterizer state.
   3189     * depth_clip == 0 implies depth clamping is enabled.
   3190     *
   3191     * When clip_halfz is enabled, then always clamp the depth values.
   3192     *
   3193     * XXX: This is incorrect for GL, but correct for d3d10 (depth
   3194     * clamp is always active in d3d10, regardless if depth clip is
    3195     * clamp is always active in d3d10, regardless of whether depth clip is
   3196     * (GL has an always-on [0,1] clamp on fs depth output instead
   3197     * to ensure the depth values stay in range. Doesn't look like
   3198     * we do that, though...)
   3199     */
   3200    if (lp->rasterizer->clip_halfz) {
   3201       key->depth_clamp = 1;
   3202    } else {
   3203       key->depth_clamp = (lp->rasterizer->depth_clip == 0) ? 1 : 0;
   3204    }
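            /*
             * I.e. the resulting clamp behaviour is:
             *
             *    clip_halfz  depth_clip  ->  depth_clamp
             *        1           x              1
             *        0           0              1
             *        0           1              0
             */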
   3205 
   3206    /* alpha test only applies if render buffer 0 is non-integer (or does not exist) */
   3207    if (!lp->framebuffer.nr_cbufs ||
   3208        !lp->framebuffer.cbufs[0] ||
   3209        !util_format_is_pure_integer(lp->framebuffer.cbufs[0]->format)) {
   3210       key->alpha.enabled = lp->depth_stencil->alpha.enabled;
   3211    }
   3212    if(key->alpha.enabled)
   3213       key->alpha.func = lp->depth_stencil->alpha.func;
   3214    /* alpha.ref_value is passed in jit_context */
   3215 
   3216    key->flatshade = lp->rasterizer->flatshade;
   3217    if (lp->active_occlusion_queries) {
   3218       key->occlusion_count = TRUE;
   3219    }
   3220 
   3221    if (lp->framebuffer.nr_cbufs) {
   3222       memcpy(&key->blend, lp->blend, sizeof key->blend);
   3223    }
   3224 
   3225    key->nr_cbufs = lp->framebuffer.nr_cbufs;
   3226 
   3227    if (!key->blend.independent_blend_enable) {
    3228       /* we always need independent blend; otherwise the fixups below won't work */
   3229       for (i = 1; i < key->nr_cbufs; i++) {
   3230          memcpy(&key->blend.rt[i], &key->blend.rt[0], sizeof(key->blend.rt[0]));
   3231       }
   3232       key->blend.independent_blend_enable = 1;
   3233    }
   3234 
   3235    for (i = 0; i < lp->framebuffer.nr_cbufs; i++) {
   3236       struct pipe_rt_blend_state *blend_rt = &key->blend.rt[i];
   3237 
   3238       if (lp->framebuffer.cbufs[i]) {
   3239          enum pipe_format format = lp->framebuffer.cbufs[i]->format;
   3240          const struct util_format_description *format_desc;
   3241 
   3242          key->cbuf_format[i] = format;
   3243 
   3244          /*
   3245           * Figure out if this is a 1d resource. Note that OpenGL allows crazy
   3246           * mixing of 2d textures with height 1 and 1d textures, so make sure
   3247           * we pick 1d if any cbuf or zsbuf is 1d.
   3248           */
   3249          if (llvmpipe_resource_is_1d(lp->framebuffer.cbufs[i]->texture)) {
   3250             key->resource_1d = TRUE;
   3251          }
   3252 
   3253          format_desc = util_format_description(format);
   3254          assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
   3255                 format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB);
   3256 
   3257          /*
   3258           * Mask out color channels not present in the color buffer.
   3259           */
   3260          blend_rt->colormask &= util_format_colormask(format_desc);
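                  /*
                   * E.g. an xRGB format stores no alpha, so alpha writes are
                   * masked off here regardless of the state tracker's
                   * colormask.
                   */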
   3261 
   3262          /*
   3263           * Disable blend for integer formats.
   3264           */
   3265          if (util_format_is_pure_integer(format)) {
   3266             blend_rt->blend_enable = 0;
   3267          }
   3268 
   3269          /*
   3270           * Our swizzled render tiles always have an alpha channel, but the
   3271           * linear render target format often does not, so force here the dst
   3272           * alpha to be one.
   3273           *
   3274           * This is not a mere optimization. Wrong results will be produced if
   3275           * the dst alpha is used, the dst format does not have alpha, and the
   3276           * previous rendering was not flushed from the swizzled to linear
   3277           * buffer. For example, NonPowTwo DCT.
   3278           *
   3279           * TODO: This should be generalized to all channels for better
   3280           * performance, but only alpha causes correctness issues.
   3281           *
   3282           * Also, force rgb/alpha func/factors match, to make AoS blending
   3283           * easier.
   3284           */
   3285          if (format_desc->swizzle[3] > PIPE_SWIZZLE_W ||
   3286              format_desc->swizzle[3] == format_desc->swizzle[0]) {
   3287             /* Doesn't cover mixed snorm/unorm but can't render to them anyway */
   3288             boolean clamped_zero = !util_format_is_float(format) &&
   3289                                    !util_format_is_snorm(format);
   3290             blend_rt->rgb_src_factor =
   3291                force_dst_alpha_one(blend_rt->rgb_src_factor, clamped_zero);
   3292             blend_rt->rgb_dst_factor =
   3293                force_dst_alpha_one(blend_rt->rgb_dst_factor, clamped_zero);
   3294             blend_rt->alpha_func       = blend_rt->rgb_func;
   3295             blend_rt->alpha_src_factor = blend_rt->rgb_src_factor;
   3296             blend_rt->alpha_dst_factor = blend_rt->rgb_dst_factor;
   3297          }
   3298       }
   3299       else {
   3300          /* no color buffer for this fragment output */
   3301          key->cbuf_format[i] = PIPE_FORMAT_NONE;
   3302          blend_rt->colormask = 0x0;
   3303          blend_rt->blend_enable = 0;
   3304       }
   3305    }
   3306 
   3307    /* This value will be the same for all the variants of a given shader:
   3308     */
   3309    key->nr_samplers = shader->info.base.file_max[TGSI_FILE_SAMPLER] + 1;
   3310 
   3311    for(i = 0; i < key->nr_samplers; ++i) {
   3312       if(shader->info.base.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) {
   3313          lp_sampler_static_sampler_state(&key->state[i].sampler_state,
   3314                                          lp->samplers[PIPE_SHADER_FRAGMENT][i]);
   3315       }
   3316    }
   3317 
   3318    /*
   3319     * XXX If TGSI_FILE_SAMPLER_VIEW exists assume all texture opcodes
   3320     * are dx10-style? Can't really have mixed opcodes, at least not
   3321     * if we want to skip the holes here (without rescanning tgsi).
   3322     */
   3323    if (shader->info.base.file_max[TGSI_FILE_SAMPLER_VIEW] != -1) {
   3324       key->nr_sampler_views = shader->info.base.file_max[TGSI_FILE_SAMPLER_VIEW] + 1;
   3325       for(i = 0; i < key->nr_sampler_views; ++i) {
   3326          if(shader->info.base.file_mask[TGSI_FILE_SAMPLER_VIEW] & (1 << i)) {
   3327             lp_sampler_static_texture_state(&key->state[i].texture_state,
   3328                                             lp->sampler_views[PIPE_SHADER_FRAGMENT][i]);
   3329          }
   3330       }
   3331    }
   3332    else {
   3333       key->nr_sampler_views = key->nr_samplers;
   3334       for(i = 0; i < key->nr_sampler_views; ++i) {
   3335          if(shader->info.base.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) {
   3336             lp_sampler_static_texture_state(&key->state[i].texture_state,
   3337                                             lp->sampler_views[PIPE_SHADER_FRAGMENT][i]);
   3338          }
   3339       }
   3340    }
   3341 }
   3342 
   3343 
   3344 
   3345 /**
   3346  * Update fragment shader state.  This is called just prior to drawing
   3347  * something when some fragment-related state has changed.
   3348  */
   3349 void
   3350 llvmpipe_update_fs(struct llvmpipe_context *lp)
   3351 {
   3352    struct lp_fragment_shader *shader = lp->fs;
   3353    struct lp_fragment_shader_variant_key key;
   3354    struct lp_fragment_shader_variant *variant = NULL;
   3355    struct lp_fs_variant_list_item *li;
   3356 
   3357    make_variant_key(lp, shader, &key);
   3358 
   3359    /* Search the variants for one which matches the key */
   3360    li = first_elem(&shader->variants);
   3361    while(!at_end(&shader->variants, li)) {
   3362       if(memcmp(&li->base->key, &key, shader->variant_key_size) == 0) {
   3363          variant = li->base;
   3364          break;
   3365       }
   3366       li = next_elem(li);
   3367    }
   3368 
   3369    if (variant) {
   3370       /* Move this variant to the head of the list to implement LRU
    3371        * eviction of shader variants when we have too many.
   3372        */
   3373       move_to_head(&lp->fs_variants_list, &variant->list_item_global);
   3374    }
   3375    else {
   3376       /* variant not found, create it now */
   3377       int64_t t0, t1, dt;
   3378       unsigned i;
   3379       unsigned variants_to_cull;
   3380 
   3381       if (LP_DEBUG & DEBUG_FS) {
   3382          debug_printf("%u variants,\t%u instrs,\t%u instrs/variant\n",
   3383                       lp->nr_fs_variants,
   3384                       lp->nr_fs_instrs,
   3385                       lp->nr_fs_variants ? lp->nr_fs_instrs / lp->nr_fs_variants : 0);
   3386       }
   3387 
   3388       /* First, check if we've exceeded the max number of shader variants.
   3389        * If so, free 6.25% of them (the least recently used ones).
   3390        */
   3391       variants_to_cull = lp->nr_fs_variants >= LP_MAX_SHADER_VARIANTS ? LP_MAX_SHADER_VARIANTS / 16 : 0;
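               /*
                * I.e. hitting the variant limit culls
                * LP_MAX_SHADER_VARIANTS/16 variants in one go (e.g. 64,
                * assuming a limit of 1024).
                */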
   3392 
   3393       if (variants_to_cull ||
   3394           lp->nr_fs_instrs >= LP_MAX_SHADER_INSTRUCTIONS) {
   3395          struct pipe_context *pipe = &lp->pipe;
   3396 
   3397          if (gallivm_debug & GALLIVM_DEBUG_PERF) {
   3398             debug_printf("Evicting FS: %u fs variants,\t%u total variants,"
   3399                          "\t%u instrs,\t%u instrs/variant\n",
   3400                          shader->variants_cached,
   3401                          lp->nr_fs_variants, lp->nr_fs_instrs,
   3402                          lp->nr_fs_instrs / lp->nr_fs_variants);
   3403          }
   3404 
   3405          /*
    3406           * XXX: until we have some sort of reference counting in fragment
    3407           * shaders we need to flush the context, as variants may still be
    3408           * binned.  Flushing alone might not be sufficient; we must wait too.
   3409           */
   3410          llvmpipe_finish(pipe, __FUNCTION__);
   3411 
   3412          /*
    3413           * We need to re-check lp->nr_fs_variants because an arbitrarily large
   3414           * number of shader variants (potentially all of them) could be
   3415           * pending for destruction on flush.
   3416           */
   3417 
   3418          for (i = 0; i < variants_to_cull || lp->nr_fs_instrs >= LP_MAX_SHADER_INSTRUCTIONS; i++) {
   3419             struct lp_fs_variant_list_item *item;
   3420             if (is_empty_list(&lp->fs_variants_list)) {
   3421                break;
   3422             }
   3423             item = last_elem(&lp->fs_variants_list);
   3424             assert(item);
   3425             assert(item->base);
   3426             llvmpipe_remove_shader_variant(lp, item->base);
   3427          }
   3428       }
   3429 
   3430       /*
   3431        * Generate the new variant.
   3432        */
   3433       t0 = os_time_get();
   3434       variant = generate_variant(lp, shader, &key);
   3435       t1 = os_time_get();
   3436       dt = t1 - t0;
   3437       LP_COUNT_ADD(llvm_compile_time, dt);
   3438       LP_COUNT_ADD(nr_llvm_compiles, 2);  /* emit vs. omit in/out test */
   3439 
   3440       /* Put the new variant into the list */
   3441       if (variant) {
   3442          insert_at_head(&shader->variants, &variant->list_item_local);
   3443          insert_at_head(&lp->fs_variants_list, &variant->list_item_global);
   3444          lp->nr_fs_variants++;
   3445          lp->nr_fs_instrs += variant->nr_instrs;
   3446          shader->variants_cached++;
   3447       }
   3448    }
   3449 
   3450    /* Bind this variant */
   3451    lp_setup_set_fs_variant(lp->setup, variant);
   3452 }
   3453 
   3454 
   3455 
   3456 
   3457 
   3458 void
   3459 llvmpipe_init_fs_funcs(struct llvmpipe_context *llvmpipe)
   3460 {
   3461    llvmpipe->pipe.create_fs_state = llvmpipe_create_fs_state;
   3462    llvmpipe->pipe.bind_fs_state   = llvmpipe_bind_fs_state;
   3463    llvmpipe->pipe.delete_fs_state = llvmpipe_delete_fs_state;
   3464 
   3465    llvmpipe->pipe.set_constant_buffer = llvmpipe_set_constant_buffer;
   3466 }
   3467 
   3468 /*
   3469  * Rasterization is disabled if there is no pixel shader and
   3470  * both depth and stencil testing are disabled:
   3471  * http://msdn.microsoft.com/en-us/library/windows/desktop/bb205125
   3472  */
   3473 boolean
   3474 llvmpipe_rasterization_disabled(struct llvmpipe_context *lp)
   3475 {
   3476    boolean null_fs = !lp->fs || lp->fs->info.base.num_tokens <= 1;
   3477 
   3478    return (null_fs &&
   3479            !lp->depth_stencil->depth.enabled &&
   3480            !lp->depth_stencil->stencil[0].enabled);
   3481 }
   3482