Home | History | Annotate | Download | only in llvmpipe
      1 /**************************************************************************
      2  *
      3  * Copyright 2009 VMware, Inc.
      4  * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
      5  * All Rights Reserved.
      6  *
      7  * Permission is hereby granted, free of charge, to any person obtaining a
      8  * copy of this software and associated documentation files (the
      9  * "Software"), to deal in the Software without restriction, including
     10  * without limitation the rights to use, copy, modify, merge, publish,
     11  * distribute, sub license, and/or sell copies of the Software, and to
     12  * permit persons to whom the Software is furnished to do so, subject to
     13  * the following conditions:
     14  *
     15  * The above copyright notice and this permission notice (including the
     16  * next paragraph) shall be included in all copies or substantial portions
     17  * of the Software.
     18  *
     19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
     20  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
     21  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
     22  * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
     23  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
     24  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
     25  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
     26  *
     27  **************************************************************************/
     28 
     29 /**
     30  * @file
     31  * Position and shader input interpolation.
     32  *
     33  * @author Jose Fonseca <jfonseca (at) vmware.com>
     34  */
     35 
     36 #include "pipe/p_shader_tokens.h"
     37 #include "util/u_debug.h"
     38 #include "util/u_memory.h"
     39 #include "util/u_math.h"
     40 #include "tgsi/tgsi_scan.h"
     41 #include "gallivm/lp_bld_debug.h"
     42 #include "gallivm/lp_bld_const.h"
     43 #include "gallivm/lp_bld_arit.h"
     44 #include "gallivm/lp_bld_swizzle.h"
     45 #include "gallivm/lp_bld_flow.h"
     46 #include "lp_bld_interp.h"
     47 
     48 
     49 /*
     50  * The shader JIT function operates on blocks of quads.
     51  * Each block has 2x2 quads and each quad has 2x2 pixels.
     52  *
     53  * We iterate over the quads in order 0, 1, 2, 3:
     54  *
     55  * #################
     56  * #   |   #   |   #
     57  * #---0---#---1---#
     58  * #   |   #   |   #
     59  * #################
     60  * #   |   #   |   #
     61  * #---2---#---3---#
     62  * #   |   #   |   #
     63  * #################
     64  *
     65  * If we iterate over multiple quads at once, quads 01 and 23 are processed
     66  * together.
     67  *
     68  * Within each quad, we have four pixels which are represented in SOA
     69  * order:
     70  *
     71  * #########
     72  * # 0 | 1 #
     73  * #---+---#
     74  * # 2 | 3 #
     75  * #########
     76  *
     77  * So the green channel (for example) of the four pixels is stored in
     78  * a single vector register: {g0, g1, g2, g3}.
     79  * The order stays the same even with multiple quads:
     80  * 0 1 4 5
     81  * 2 3 6 7
     82  * is stored as g0..g7
     83  */
     84 
     85 
     86 /**
     87  * Do one perspective divide per quad.
     88  *
     89  * For perspective interpolation, the final attribute value is given
     90  *
     91  *  a' = a/w = a * oow
     92  *
     93  * where
     94  *
     95  *  a = a0 + dadx*x + dady*y
     96  *  w = w0 + dwdx*x + dwdy*y
     97  *  oow = 1/w = 1/(w0 + dwdx*x + dwdy*y)
     98  *
     99  * Instead of computing the division per pixel, with this macro we compute the
    100  * division on the upper left pixel of each quad, and use a linear
    101  * approximation in the remaining pixels, given by:
    102  *
    103  *  da'dx = (dadx - dwdx*a)*oow
    104  *  da'dy = (dady - dwdy*a)*oow
    105  *
    106  * Ironically, this actually makes things slower -- probably because the
    107  * divide hardware unit is rarely used, whereas the multiply unit is typically
    108  * already saturated.
    109  */
#define PERSPECTIVE_DIVIDE_PER_QUAD 0


/*
 * Pixel x/y offsets, relative to the upper-left corner of a 4x4 pixel block,
 * for each of its 16 pixels.  The index follows the ordering described at the
 * top of this file: entries 0-3 belong to quad 0, 4-7 to quad 1, 8-11 to
 * quad 2 and 12-15 to quad 3; within a quad the order is upper-left,
 * upper-right, lower-left, lower-right.
 */
static const unsigned char quad_offset_x[16] = {0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3};
static const unsigned char quad_offset_y[16] = {0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3};
    115 
    116 
    117 static void
    118 attrib_name(LLVMValueRef val, unsigned attrib, unsigned chan, const char *suffix)
    119 {
    120    if(attrib == 0)
    121       lp_build_name(val, "pos.%c%s", "xyzw"[chan], suffix);
    122    else
    123       lp_build_name(val, "input%u.%c%s", attrib - 1, "xyzw"[chan], suffix);
    124 }
    125 
    126 static void
    127 calc_offsets(struct lp_build_context *coeff_bld,
    128              unsigned quad_start_index,
    129              LLVMValueRef *pixoffx,
    130              LLVMValueRef *pixoffy)
    131 {
    132    unsigned i;
    133    unsigned num_pix = coeff_bld->type.length;
    134    struct gallivm_state *gallivm = coeff_bld->gallivm;
    135    LLVMBuilderRef builder = coeff_bld->gallivm->builder;
    136    LLVMValueRef nr, pixxf, pixyf;
    137 
    138    *pixoffx = coeff_bld->undef;
    139    *pixoffy = coeff_bld->undef;
    140 
    141    for (i = 0; i < num_pix; i++) {
    142       nr = lp_build_const_int32(gallivm, i);
    143       pixxf = lp_build_const_float(gallivm, quad_offset_x[i % num_pix] +
    144                                    (quad_start_index & 1) * 2);
    145       pixyf = lp_build_const_float(gallivm, quad_offset_y[i % num_pix] +
    146                                    (quad_start_index & 2));
    147       *pixoffx = LLVMBuildInsertElement(builder, *pixoffx, pixxf, nr, "");
    148       *pixoffy = LLVMBuildInsertElement(builder, *pixoffy, pixyf, nr, "");
    149    }
    150 }
    151 
    152 
    153 /* Much easier, and significantly less instructions in the per-stamp
    154  * part (less than half) but overall more instructions so a loss if
    155  * most quads are active. Might be a win though with larger vectors.
    156  * No ability to do per-quad divide (doable but not implemented)
    157  * Could be made to work with passed in pixel offsets (i.e. active quad merging).
    158  */
    159 static void
    160 coeffs_init_simple(struct lp_build_interp_soa_context *bld,
    161                    LLVMValueRef a0_ptr,
    162                    LLVMValueRef dadx_ptr,
    163                    LLVMValueRef dady_ptr)
    164 {
    165    struct lp_build_context *coeff_bld = &bld->coeff_bld;
    166    struct lp_build_context *setup_bld = &bld->setup_bld;
    167    struct gallivm_state *gallivm = coeff_bld->gallivm;
    168    LLVMBuilderRef builder = gallivm->builder;
    169    unsigned attrib;
    170 
    171    for (attrib = 0; attrib < bld->num_attribs; ++attrib) {
    172       /*
    173        * always fetch all 4 values for performance/simplicity
    174        * Note: we do that here because it seems to generate better
    175        * code. It generates a lot of moves initially but less
    176        * moves later. As far as I can tell this looks like a
    177        * llvm issue, instead of simply reloading the values from
    178        * the passed in pointers it if it runs out of registers
    179        * it spills/reloads them. Maybe some optimization passes
    180        * would help.
    181        * Might want to investigate this again later.
    182        */
    183       const unsigned interp = bld->interp[attrib];
    184       LLVMValueRef index = lp_build_const_int32(gallivm,
    185                                 attrib * TGSI_NUM_CHANNELS);
    186       LLVMValueRef ptr;
    187       LLVMValueRef dadxaos = setup_bld->zero;
    188       LLVMValueRef dadyaos = setup_bld->zero;
    189       LLVMValueRef a0aos = setup_bld->zero;
    190 
    191       switch (interp) {
    192       case LP_INTERP_PERSPECTIVE:
    193          /* fall-through */
    194 
    195       case LP_INTERP_LINEAR:
    196          ptr = LLVMBuildGEP(builder, dadx_ptr, &index, 1, "");
    197          ptr = LLVMBuildBitCast(builder, ptr,
    198                LLVMPointerType(setup_bld->vec_type, 0), "");
    199          dadxaos = LLVMBuildLoad(builder, ptr, "");
    200 
    201          ptr = LLVMBuildGEP(builder, dady_ptr, &index, 1, "");
    202          ptr = LLVMBuildBitCast(builder, ptr,
    203                LLVMPointerType(setup_bld->vec_type, 0), "");
    204          dadyaos = LLVMBuildLoad(builder, ptr, "");
    205 
    206          attrib_name(dadxaos, attrib, 0, ".dadxaos");
    207          attrib_name(dadyaos, attrib, 0, ".dadyaos");
    208          /* fall-through */
    209 
    210       case LP_INTERP_CONSTANT:
    211       case LP_INTERP_FACING:
    212          ptr = LLVMBuildGEP(builder, a0_ptr, &index, 1, "");
    213          ptr = LLVMBuildBitCast(builder, ptr,
    214                LLVMPointerType(setup_bld->vec_type, 0), "");
    215          a0aos = LLVMBuildLoad(builder, ptr, "");
    216          attrib_name(a0aos, attrib, 0, ".a0aos");
    217          break;
    218 
    219       case LP_INTERP_POSITION:
    220          /* Nothing to do as the position coeffs are already setup in slot 0 */
    221          continue;
    222 
    223       default:
    224          assert(0);
    225          break;
    226       }
    227       bld->a0aos[attrib] = a0aos;
    228       bld->dadxaos[attrib] = dadxaos;
    229       bld->dadyaos[attrib] = dadyaos;
    230    }
    231 }
    232 
    233 /**
    234  * Interpolate the shader input attribute values.
    235  * This is called for each (group of) quad(s).
    236  */
    237 static void
    238 attribs_update_simple(struct lp_build_interp_soa_context *bld,
    239                       struct gallivm_state *gallivm,
    240                       int quad_start_index,
    241                       LLVMValueRef loop_iter,
    242                       int start,
    243                       int end)
    244 {
    245    LLVMBuilderRef builder = gallivm->builder;
    246    struct lp_build_context *coeff_bld = &bld->coeff_bld;
    247    struct lp_build_context *setup_bld = &bld->setup_bld;
    248    LLVMValueRef oow = NULL;
    249    unsigned attrib;
    250    LLVMValueRef pixoffx;
    251    LLVMValueRef pixoffy;
    252 
    253    /* could do this with code-generated passed in pixel offsets too */
    254    if (bld->dynamic_offsets) {
    255       LLVMValueRef ptr;
    256 
    257       assert(loop_iter);
    258       ptr = LLVMBuildGEP(builder, bld->xoffset_store, &loop_iter, 1, "");
    259       pixoffx = LLVMBuildLoad(builder, ptr, "");
    260       ptr = LLVMBuildGEP(builder, bld->yoffset_store, &loop_iter, 1, "");
    261       pixoffy = LLVMBuildLoad(builder, ptr, "");
    262    }
    263    else {
    264       calc_offsets(coeff_bld, quad_start_index, &pixoffx, &pixoffy);
    265    }
    266 
    267    pixoffx = LLVMBuildFAdd(builder, pixoffx,
    268                            lp_build_broadcast_scalar(coeff_bld, bld->x), "");
    269    pixoffy = LLVMBuildFAdd(builder, pixoffy,
    270                            lp_build_broadcast_scalar(coeff_bld, bld->y), "");
    271 
    272    for (attrib = start; attrib < end; attrib++) {
    273       const unsigned mask = bld->mask[attrib];
    274       const unsigned interp = bld->interp[attrib];
    275       unsigned chan;
    276 
    277       for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
    278          if (mask & (1 << chan)) {
    279             LLVMValueRef index;
    280             LLVMValueRef dadx = coeff_bld->zero;
    281             LLVMValueRef dady = coeff_bld->zero;
    282             LLVMValueRef a = coeff_bld->zero;
    283 
    284             index = lp_build_const_int32(gallivm, chan);
    285             switch (interp) {
    286             case LP_INTERP_PERSPECTIVE:
    287                /* fall-through */
    288 
    289             case LP_INTERP_LINEAR:
    290                if (attrib == 0 && chan == 0) {
    291                   dadx = coeff_bld->one;
    292                }
    293                else if (attrib == 0 && chan == 1) {
    294                   dady = coeff_bld->one;
    295                }
    296                else {
    297                   dadx = lp_build_extract_broadcast(gallivm, setup_bld->type,
    298                                                     coeff_bld->type, bld->dadxaos[attrib],
    299                                                     index);
    300                   dady = lp_build_extract_broadcast(gallivm, setup_bld->type,
    301                                                     coeff_bld->type, bld->dadyaos[attrib],
    302                                                     index);
    303                   a = lp_build_extract_broadcast(gallivm, setup_bld->type,
    304                                                  coeff_bld->type, bld->a0aos[attrib],
    305                                                  index);
    306                }
    307                /*
    308                 * a = a0 + (x * dadx + y * dady)
    309                 */
    310                dadx = LLVMBuildFMul(builder, dadx, pixoffx, "");
    311                dady = LLVMBuildFMul(builder, dady, pixoffy, "");
    312                a = LLVMBuildFAdd(builder, a, dadx, "");
    313                a = LLVMBuildFAdd(builder, a, dady, "");
    314 
    315                if (interp == LP_INTERP_PERSPECTIVE) {
    316                   if (oow == NULL) {
    317                      LLVMValueRef w = bld->attribs[0][3];
    318                      assert(attrib != 0);
    319                      assert(bld->mask[0] & TGSI_WRITEMASK_W);
    320                      oow = lp_build_rcp(coeff_bld, w);
    321                   }
    322                   a = lp_build_mul(coeff_bld, a, oow);
    323                }
    324                break;
    325 
    326             case LP_INTERP_CONSTANT:
    327             case LP_INTERP_FACING:
    328                a = lp_build_extract_broadcast(gallivm, setup_bld->type,
    329                                               coeff_bld->type, bld->a0aos[attrib],
    330                                               index);
    331                break;
    332 
    333             case LP_INTERP_POSITION:
    334                assert(attrib > 0);
    335                a = bld->attribs[0][chan];
    336                break;
    337 
    338             default:
    339                assert(0);
    340                break;
    341             }
    342 
    343             if ((attrib == 0) && (chan == 2)){
    344                /* FIXME: Depth values can exceed 1.0, due to the fact that
    345                 * setup interpolation coefficients refer to (0,0) which causes
    346                 * precision loss. So we must clamp to 1.0 here to avoid artifacts
    347                 */
    348                a = lp_build_min(coeff_bld, a, coeff_bld->one);
    349             }
    350             bld->attribs[attrib][chan] = a;
    351          }
    352       }
    353    }
    354 }
    355 
/**
 * Initialize the bld->a and bld->dadq fields.
 *
 * For every attribute the a0/dadx/dady coefficient vectors are loaded from
 * the arrays passed into the JIT function (a0_ptr, dadx_ptr, dady_ptr), a0
 * is moved to the stamp position (bld->x, bld->y), and for every enabled
 * channel a per-quad start value (bld->a) and a per-pixel delta vector
 * (bld->dadq) are built.
 *
 * When bld->dynamic_offsets is set, bld->a[attrib][chan] holds a pointer
 * obtained from lp_build_alloca() to which the vector was stored, rather
 * than the vector value itself.
 */
static void
coeffs_init(struct lp_build_interp_soa_context *bld,
            LLVMValueRef a0_ptr,
            LLVMValueRef dadx_ptr,
            LLVMValueRef dady_ptr)
{
   struct lp_build_context *coeff_bld = &bld->coeff_bld;
   struct lp_build_context *setup_bld = &bld->setup_bld;
   struct gallivm_state *gallivm = coeff_bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef pixoffx, pixoffy;
   unsigned attrib;
   unsigned chan;
   unsigned i;

   /* Constant vectors with the x/y offset of each pixel in the vector. */
   pixoffx = coeff_bld->undef;
   pixoffy = coeff_bld->undef;
   for (i = 0; i < coeff_bld->type.length; i++) {
      LLVMValueRef nr = lp_build_const_int32(gallivm, i);
      LLVMValueRef pixxf = lp_build_const_float(gallivm, quad_offset_x[i]);
      LLVMValueRef pixyf = lp_build_const_float(gallivm, quad_offset_y[i]);
      pixoffx = LLVMBuildInsertElement(builder, pixoffx, pixxf, nr, "");
      pixoffy = LLVMBuildInsertElement(builder, pixoffy, pixyf, nr, "");
   }


   for (attrib = 0; attrib < bld->num_attribs; ++attrib) {
      const unsigned mask = bld->mask[attrib];
      const unsigned interp = bld->interp[attrib];
      /* index of the attribute's first channel in the coefficient arrays */
      LLVMValueRef index = lp_build_const_int32(gallivm,
                                attrib * TGSI_NUM_CHANNELS);
      LLVMValueRef ptr;
      LLVMValueRef dadxaos = setup_bld->zero;
      LLVMValueRef dadyaos = setup_bld->zero;
      LLVMValueRef a0aos = setup_bld->zero;

      /* always fetch all 4 values for performance/simplicity */
      switch (interp) {
      case LP_INTERP_PERSPECTIVE:
         /* fall-through */

      case LP_INTERP_LINEAR:
         ptr = LLVMBuildGEP(builder, dadx_ptr, &index, 1, "");
         ptr = LLVMBuildBitCast(builder, ptr,
               LLVMPointerType(setup_bld->vec_type, 0), "");
         dadxaos = LLVMBuildLoad(builder, ptr, "");

         ptr = LLVMBuildGEP(builder, dady_ptr, &index, 1, "");
         ptr = LLVMBuildBitCast(builder, ptr,
               LLVMPointerType(setup_bld->vec_type, 0), "");
         dadyaos = LLVMBuildLoad(builder, ptr, "");

         attrib_name(dadxaos, attrib, 0, ".dadxaos");
         attrib_name(dadyaos, attrib, 0, ".dadyaos");
         /* fall-through */

      case LP_INTERP_CONSTANT:
      case LP_INTERP_FACING:
         ptr = LLVMBuildGEP(builder, a0_ptr, &index, 1, "");
         ptr = LLVMBuildBitCast(builder, ptr,
               LLVMPointerType(setup_bld->vec_type, 0), "");
         a0aos = LLVMBuildLoad(builder, ptr, "");
         attrib_name(a0aos, attrib, 0, ".a0aos");
         break;

      case LP_INTERP_POSITION:
         /* Nothing to do as the position coeffs are already setup in slot 0 */
         continue;

      default:
         assert(0);
         break;
      }

      /*
       * a = a0 + (x * dadx + y * dady)
       * After this step a0aos is the attrib value at the top left corner
       * of the stamp, i.e. at (bld->x, bld->y).
       */
      if (interp != LP_INTERP_CONSTANT &&
          interp != LP_INTERP_FACING) {
         LLVMValueRef axaos, ayaos;
         axaos = LLVMBuildFMul(builder, lp_build_broadcast_scalar(setup_bld, bld->x),
                               dadxaos, "");
         ayaos = LLVMBuildFMul(builder, lp_build_broadcast_scalar(setup_bld, bld->y),
                               dadyaos, "");
         a0aos = LLVMBuildFAdd(builder, a0aos, ayaos, "");
         a0aos = LLVMBuildFAdd(builder, a0aos, axaos, "");
      }

      /*
       * dadq = {0, dadx, dady, dadx + dady}
       * for two quads (side by side) this is:
       * {0, dadx, dady, dadx+dady, 2*dadx, 3*dadx, 2*dadx+dady, 3*dadx+dady}
       */
      for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
         /* this generates a very large number of shuffles... */
         if (mask & (1 << chan)) {
            LLVMValueRef dadx, dady;
            LLVMValueRef dadq, dadq2;
            LLVMValueRef a;
            LLVMValueRef chan_index = lp_build_const_int32(gallivm, chan);

            if (attrib == 0 && chan == 0) {
               /* pos.x: starts at bld->x and grows by 1 per pixel in x */
               a = lp_build_broadcast_scalar(coeff_bld, bld->x);
               dadx = coeff_bld->one;
               dady = coeff_bld->zero;
            }
            else if (attrib == 0 && chan == 1) {
               /* pos.y: starts at bld->y and grows by 1 per pixel in y */
               a = lp_build_broadcast_scalar(coeff_bld, bld->y);
               dady = coeff_bld->one;
               dadx = coeff_bld->zero;
            }
            else {
               dadx = lp_build_extract_broadcast(gallivm, setup_bld->type,
                                              coeff_bld->type, dadxaos, chan_index);
               dady = lp_build_extract_broadcast(gallivm, setup_bld->type,
                                              coeff_bld->type, dadyaos, chan_index);

               /*
                * a = {a, a, a, a}
                */
               a = lp_build_extract_broadcast(gallivm, setup_bld->type,
                                              coeff_bld->type, a0aos, chan_index);
            }

            /* dadq[i] = dadx * pixoffx[i] + dady * pixoffy[i] */
            dadx = LLVMBuildFMul(builder, dadx, pixoffx, "");
            dady = LLVMBuildFMul(builder, dady, pixoffy, "");
            dadq = LLVMBuildFAdd(builder, dadx, dady, "");

            /*
             * Compute the attrib values on the upper-left corner of each
             * group of quads: a += 2 * dadq.  The first four elements of
             * 2 * dadq are {0, 2*dadx, 2*dady, 2*dadx + 2*dady}, the
             * offsets of the upper-left pixel of quads 0..3.
             * Note that if we process 2 quads at once this doesn't
             * do exactly what we want.
             * We need to access elem 0 and 2 respectively later if we process
             * 2 quads at once.
             */

            if (interp != LP_INTERP_CONSTANT &&
                interp != LP_INTERP_FACING) {
               dadq2 = LLVMBuildFAdd(builder, dadq, dadq, "");
               a = LLVMBuildFAdd(builder, a, dadq2, "");
	    }

#if PERSPECTIVE_DIVIDE_PER_QUAD
            /*
             * a *= 1 / w
             */

            /*
             * XXX since we're only going to access elements 0,2 out of 8
             * if we have 8-wide vectors we should do the division only 4-wide.
             * a is really a 2-elements in a 4-wide vector disguised as 8-wide
             * in this case.
             */
            if (interp == LP_INTERP_PERSPECTIVE) {
               LLVMValueRef w = bld->a[0][3];
               assert(attrib != 0);
               assert(bld->mask[0] & TGSI_WRITEMASK_W);
               if (!bld->oow) {
                  bld->oow = lp_build_rcp(coeff_bld, w);
                  lp_build_name(bld->oow, "oow");
               }
               a = lp_build_mul(coeff_bld, a, bld->oow);
            }
#endif

            attrib_name(a, attrib, chan, ".a");
            attrib_name(dadq, attrib, chan, ".dadq");

            if (bld->dynamic_offsets) {
               /* keep the vector behind a pointer; bld->a holds the pointer */
               bld->a[attrib][chan] = lp_build_alloca(gallivm,
                                                      LLVMTypeOf(a), "");
               LLVMBuildStore(builder, a, bld->a[attrib][chan]);
            }
            else {
               bld->a[attrib][chan] = a;
            }
            bld->dadq[attrib][chan] = dadq;
         }
      }
   }
}
    543 
    544 
    545 /**
    546  * Increment the shader input attribute values.
    547  * This is called when we move from one quad to the next.
    548  */
    549 static void
    550 attribs_update(struct lp_build_interp_soa_context *bld,
    551                struct gallivm_state *gallivm,
    552                int quad_start_index,
    553                LLVMValueRef loop_iter,
    554                int start,
    555                int end)
    556 {
    557    LLVMBuilderRef builder = gallivm->builder;
    558    struct lp_build_context *coeff_bld = &bld->coeff_bld;
    559    LLVMValueRef shuffle = lp_build_const_int_vec(gallivm, coeff_bld->type, quad_start_index);
    560    LLVMValueRef oow = NULL;
    561    unsigned attrib;
    562    unsigned chan;
    563 
    564    assert(quad_start_index < 4);
    565 
    566    for(attrib = start; attrib < end; ++attrib) {
    567       const unsigned mask = bld->mask[attrib];
    568       const unsigned interp = bld->interp[attrib];
    569       for(chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
    570          if(mask & (1 << chan)) {
    571             LLVMValueRef a;
    572             if (interp == LP_INTERP_CONSTANT ||
    573                 interp == LP_INTERP_FACING) {
    574                a = bld->a[attrib][chan];
    575                if (bld->dynamic_offsets) {
    576                   a = LLVMBuildLoad(builder, a, "");
    577                }
    578             }
    579             else if (interp == LP_INTERP_POSITION) {
    580                assert(attrib > 0);
    581                a = bld->attribs[0][chan];
    582             }
    583             else {
    584                LLVMValueRef dadq;
    585 
    586                a = bld->a[attrib][chan];
    587 
    588                /*
    589                 * Broadcast the attribute value for this quad into all elements
    590                 */
    591 
    592                if (bld->dynamic_offsets) {
    593                   /* stored as vector load as float */
    594                   LLVMTypeRef ptr_type = LLVMPointerType(LLVMFloatTypeInContext(
    595                                                             gallivm->context), 0);
    596                   LLVMValueRef ptr;
    597                   a = LLVMBuildBitCast(builder, a, ptr_type, "");
    598                   ptr = LLVMBuildGEP(builder, a, &loop_iter, 1, "");
    599                   a = LLVMBuildLoad(builder, ptr, "");
    600                   a = lp_build_broadcast_scalar(&bld->coeff_bld, a);
    601                }
    602                else {
    603                   a = LLVMBuildShuffleVector(builder,
    604                                              a, coeff_bld->undef, shuffle, "");
    605                }
    606 
    607                /*
    608                 * Get the derivatives.
    609                 */
    610 
    611                dadq = bld->dadq[attrib][chan];
    612 
    613 #if PERSPECTIVE_DIVIDE_PER_QUAD
    614                if (interp == LP_INTERP_PERSPECTIVE) {
    615                   LLVMValueRef dwdq = bld->dadq[0][3];
    616 
    617                   if (oow == NULL) {
    618                      assert(bld->oow);
    619                      oow = LLVMBuildShuffleVector(coeff_bld->builder,
    620                                                   bld->oow, coeff_bld->undef,
    621                                                   shuffle, "");
    622                   }
    623 
    624                   dadq = lp_build_sub(coeff_bld,
    625                                       dadq,
    626                                       lp_build_mul(coeff_bld, a, dwdq));
    627                   dadq = lp_build_mul(coeff_bld, dadq, oow);
    628                }
    629 #endif
    630 
    631                /*
    632                 * Add the derivatives
    633                 */
    634 
    635                a = lp_build_add(coeff_bld, a, dadq);
    636 
    637 #if !PERSPECTIVE_DIVIDE_PER_QUAD
    638                if (interp == LP_INTERP_PERSPECTIVE) {
    639                   if (oow == NULL) {
    640                      LLVMValueRef w = bld->attribs[0][3];
    641                      assert(attrib != 0);
    642                      assert(bld->mask[0] & TGSI_WRITEMASK_W);
    643                      oow = lp_build_rcp(coeff_bld, w);
    644                   }
    645                   a = lp_build_mul(coeff_bld, a, oow);
    646                }
    647 #endif
    648 
    649                if (attrib == 0 && chan == 2) {
    650                   /* FIXME: Depth values can exceed 1.0, due to the fact that
    651                    * setup interpolation coefficients refer to (0,0) which causes
    652                    * precision loss. So we must clamp to 1.0 here to avoid artifacts
    653                    */
    654                   a = lp_build_min(coeff_bld, a, coeff_bld->one);
    655                }
    656 
    657                attrib_name(a, attrib, chan, "");
    658             }
    659             bld->attribs[attrib][chan] = a;
    660          }
    661       }
    662    }
    663 }
    664 
    665 
    666 /**
    667  * Generate the position vectors.
    668  *
    669  * Parameter x0, y0 are the integer values with upper left coordinates.
    670  */
    671 static void
    672 pos_init(struct lp_build_interp_soa_context *bld,
    673          LLVMValueRef x0,
    674          LLVMValueRef y0)
    675 {
    676    LLVMBuilderRef builder = bld->coeff_bld.gallivm->builder;
    677    struct lp_build_context *coeff_bld = &bld->coeff_bld;
    678 
    679    bld->x = LLVMBuildSIToFP(builder, x0, coeff_bld->elem_type, "");
    680    bld->y = LLVMBuildSIToFP(builder, y0, coeff_bld->elem_type, "");
    681 }
    682 
    683 
    684 /**
    685  * Initialize fragment shader input attribute info.
    686  */
    687 void
    688 lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
    689                          struct gallivm_state *gallivm,
    690                          unsigned num_inputs,
    691                          const struct lp_shader_input *inputs,
    692                          LLVMBuilderRef builder,
    693                          struct lp_type type,
    694                          boolean dynamic_offsets,
    695                          LLVMValueRef a0_ptr,
    696                          LLVMValueRef dadx_ptr,
    697                          LLVMValueRef dady_ptr,
    698                          LLVMValueRef x0,
    699                          LLVMValueRef y0)
    700 {
    701    struct lp_type coeff_type;
    702    struct lp_type setup_type;
    703    unsigned attrib;
    704    unsigned chan;
    705 
    706    memset(bld, 0, sizeof *bld);
    707 
    708    memset(&coeff_type, 0, sizeof coeff_type);
    709    coeff_type.floating = TRUE;
    710    coeff_type.sign = TRUE;
    711    coeff_type.width = 32;
    712    coeff_type.length = type.length;
    713 
    714    memset(&setup_type, 0, sizeof setup_type);
    715    setup_type.floating = TRUE;
    716    setup_type.sign = TRUE;
    717    setup_type.width = 32;
    718    setup_type.length = TGSI_NUM_CHANNELS;
    719 
    720 
    721    /* XXX: we don't support interpolating into any other types */
    722    assert(memcmp(&coeff_type, &type, sizeof coeff_type) == 0);
    723 
    724    lp_build_context_init(&bld->coeff_bld, gallivm, coeff_type);
    725    lp_build_context_init(&bld->setup_bld, gallivm, setup_type);
    726 
    727    /* For convenience */
    728    bld->pos = bld->attribs[0];
    729    bld->inputs = (const LLVMValueRef (*)[TGSI_NUM_CHANNELS]) bld->attribs[1];
    730 
    731    /* Position */
    732    bld->mask[0] = TGSI_WRITEMASK_XYZW;
    733    bld->interp[0] = LP_INTERP_LINEAR;
    734 
    735    /* Inputs */
    736    for (attrib = 0; attrib < num_inputs; ++attrib) {
    737       bld->mask[1 + attrib] = inputs[attrib].usage_mask;
    738       bld->interp[1 + attrib] = inputs[attrib].interp;
    739    }
    740    bld->num_attribs = 1 + num_inputs;
    741 
    742    /* Ensure all masked out input channels have a valid value */
    743    for (attrib = 0; attrib < bld->num_attribs; ++attrib) {
    744       for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
    745          bld->attribs[attrib][chan] = bld->coeff_bld.undef;
    746       }
    747    }
    748 
    749    pos_init(bld, x0, y0);
    750 
    751    if (coeff_type.length > 4) {
    752       bld->simple_interp = TRUE;
    753       if (dynamic_offsets) {
    754          /*XXXthis should use a global static table */
    755          unsigned i;
    756          unsigned num_loops = 16 / type.length;
    757          LLVMValueRef pixoffx, pixoffy, index;
    758          LLVMValueRef ptr;
    759 
    760          bld->dynamic_offsets = TRUE;
    761          bld->xoffset_store = lp_build_array_alloca(gallivm,
    762                                                     lp_build_vec_type(gallivm, type),
    763                                                     lp_build_const_int32(gallivm, num_loops),
    764                                                     "");
    765          bld->yoffset_store = lp_build_array_alloca(gallivm,
    766                                                     lp_build_vec_type(gallivm, type),
    767                                                     lp_build_const_int32(gallivm, num_loops),
    768                                                     "");
    769          for (i = 0; i < num_loops; i++) {
    770             index = lp_build_const_int32(gallivm, i);
    771             calc_offsets(&bld->coeff_bld, i*type.length/4, &pixoffx, &pixoffy);
    772             ptr = LLVMBuildGEP(builder, bld->xoffset_store, &index, 1, "");
    773             LLVMBuildStore(builder, pixoffx, ptr);
    774             ptr = LLVMBuildGEP(builder, bld->yoffset_store, &index, 1, "");
    775             LLVMBuildStore(builder, pixoffy, ptr);
    776          }
    777       }
    778       coeffs_init_simple(bld, a0_ptr, dadx_ptr, dady_ptr);
    779    }
    780    else {
    781       bld->simple_interp = FALSE;
    782       if (dynamic_offsets) {
    783          bld->dynamic_offsets = TRUE;
    784       }
    785       coeffs_init(bld, a0_ptr, dadx_ptr, dady_ptr);
    786    }
    787 
    788 }
    789 
    790 
    791 /**
    792  * Advance the position and inputs to the given quad within the block.
    793  */
    794 void
    795 lp_build_interp_soa_update_inputs(struct lp_build_interp_soa_context *bld,
    796                                   struct gallivm_state *gallivm,
    797                                   int quad_start_index)
    798 {
    799    assert(quad_start_index < 4);
    800 
    801    if (bld->simple_interp) {
    802       attribs_update_simple(bld, gallivm, quad_start_index, NULL, 1, bld->num_attribs);
    803    }
    804    else {
    805       attribs_update(bld, gallivm, quad_start_index, NULL, 1, bld->num_attribs);
    806    }
    807 }
    808 
    809 void
    810 lp_build_interp_soa_update_pos(struct lp_build_interp_soa_context *bld,
    811                                struct gallivm_state *gallivm,
    812                                int quad_start_index)
    813 {
    814    assert(quad_start_index < 4);
    815 
    816    if (bld->simple_interp) {
    817       attribs_update_simple(bld, gallivm, quad_start_index, NULL, 0, 1);
    818    }
    819    else {
    820       attribs_update(bld, gallivm, quad_start_index, NULL, 0, 1);
    821    }
    822 }
    823 
    824 void
    825 lp_build_interp_soa_update_inputs_dyn(struct lp_build_interp_soa_context *bld,
    826                                       struct gallivm_state *gallivm,
    827                                       LLVMValueRef quad_start_index)
    828 {
    829    if (bld->simple_interp) {
    830       attribs_update_simple(bld, gallivm, 0, quad_start_index, 1, bld->num_attribs);
    831    }
    832    else {
    833       attribs_update(bld, gallivm, 0, quad_start_index, 1, bld->num_attribs);
    834    }
    835 }
    836 
    837 void
    838 lp_build_interp_soa_update_pos_dyn(struct lp_build_interp_soa_context *bld,
    839                                    struct gallivm_state *gallivm,
    840                                    LLVMValueRef quad_start_index)
    841 {
    842    if (bld->simple_interp) {
    843       attribs_update_simple(bld, gallivm, 0, quad_start_index, 0, 1);
    844    }
    845    else {
    846       attribs_update(bld, gallivm, 0, quad_start_index, 0, 1);
    847    }
    848 }
    849 
    850