Home | History | Annotate | Download | only in tgsi
      1 /**************************************************************************
      2  *
      3  * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
      4  * All Rights Reserved.
      5  * Copyright 2009-2010 VMware, Inc.  All rights Reserved.
      6  *
      7  * Permission is hereby granted, free of charge, to any person obtaining a
      8  * copy of this software and associated documentation files (the
      9  * "Software"), to deal in the Software without restriction, including
     10  * without limitation the rights to use, copy, modify, merge, publish,
     11  * distribute, sub license, and/or sell copies of the Software, and to
     12  * permit persons to whom the Software is furnished to do so, subject to
     13  * the following conditions:
     14  *
     15  * The above copyright notice and this permission notice (including the
     16  * next paragraph) shall be included in all copies or substantial portions
     17  * of the Software.
     18  *
     19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
     20  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
     21  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
     22  * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
     23  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
     24  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
     25  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
     26  *
     27  **************************************************************************/
     28 
     29 /**
     30  * TGSI interpreter/executor.
     31  *
     32  * Flow control information:
     33  *
     34  * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
     35  * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
     36  * care since a condition may be true for some quad components but false
     37  * for other components.
     38  *
     39  * We basically execute all statements (even if they're in the part of
     40  * an IF/ELSE clause that's "not taken") and use a special mask to
     41  * control writing to destination registers.  This is the ExecMask.
     42  * See store_dest().
     43  *
     44  * The ExecMask is the AND of several other masks (CondMask, LoopMask,
     45  * ContMask, Switch.mask and FuncMask; see UPDATE_EXEC_MASK).  The first three
     46  * are controlled by the flow control instructions IF/ELSE/ENDIF, LOOP/ENDLOOP and CONT.
     47  *
     48  *
     49  * Authors:
     50  *   Michal Krol
     51  *   Brian Paul
     52  */
     53 
     54 #include "pipe/p_compiler.h"
     55 #include "pipe/p_state.h"
     56 #include "pipe/p_shader_tokens.h"
     57 #include "tgsi/tgsi_dump.h"
     58 #include "tgsi/tgsi_parse.h"
     59 #include "tgsi/tgsi_util.h"
     60 #include "tgsi_exec.h"
     61 #include "util/u_memory.h"
     62 #include "util/u_math.h"
     63 
     64 
     65 #define FAST_MATH 0
     66 
     67 #define TILE_TOP_LEFT     0
     68 #define TILE_TOP_RIGHT    1
     69 #define TILE_BOTTOM_LEFT  2
     70 #define TILE_BOTTOM_RIGHT 3
     71 
     72 static void
     73 micro_abs(union tgsi_exec_channel *dst,
     74           const union tgsi_exec_channel *src)
     75 {
     76    dst->f[0] = fabsf(src->f[0]);
     77    dst->f[1] = fabsf(src->f[1]);
     78    dst->f[2] = fabsf(src->f[2]);
     79    dst->f[3] = fabsf(src->f[3]);
     80 }
     81 
     82 static void
     83 micro_arl(union tgsi_exec_channel *dst,
     84           const union tgsi_exec_channel *src)
     85 {
     86    dst->i[0] = (int)floorf(src->f[0]);
     87    dst->i[1] = (int)floorf(src->f[1]);
     88    dst->i[2] = (int)floorf(src->f[2]);
     89    dst->i[3] = (int)floorf(src->f[3]);
     90 }
     91 
     92 static void
     93 micro_arr(union tgsi_exec_channel *dst,
     94           const union tgsi_exec_channel *src)
     95 {
     96    dst->i[0] = (int)floorf(src->f[0] + 0.5f);
     97    dst->i[1] = (int)floorf(src->f[1] + 0.5f);
     98    dst->i[2] = (int)floorf(src->f[2] + 0.5f);
     99    dst->i[3] = (int)floorf(src->f[3] + 0.5f);
    100 }
    101 
    102 static void
    103 micro_ceil(union tgsi_exec_channel *dst,
    104            const union tgsi_exec_channel *src)
    105 {
    106    dst->f[0] = ceilf(src->f[0]);
    107    dst->f[1] = ceilf(src->f[1]);
    108    dst->f[2] = ceilf(src->f[2]);
    109    dst->f[3] = ceilf(src->f[3]);
    110 }
    111 
    112 static void
    113 micro_clamp(union tgsi_exec_channel *dst,
    114             const union tgsi_exec_channel *src0,
    115             const union tgsi_exec_channel *src1,
    116             const union tgsi_exec_channel *src2)
    117 {
    118    dst->f[0] = src0->f[0] < src1->f[0] ? src1->f[0] : src0->f[0] > src2->f[0] ? src2->f[0] : src0->f[0];
    119    dst->f[1] = src0->f[1] < src1->f[1] ? src1->f[1] : src0->f[1] > src2->f[1] ? src2->f[1] : src0->f[1];
    120    dst->f[2] = src0->f[2] < src1->f[2] ? src1->f[2] : src0->f[2] > src2->f[2] ? src2->f[2] : src0->f[2];
    121    dst->f[3] = src0->f[3] < src1->f[3] ? src1->f[3] : src0->f[3] > src2->f[3] ? src2->f[3] : src0->f[3];
    122 }
    123 
    124 static void
    125 micro_cmp(union tgsi_exec_channel *dst,
    126           const union tgsi_exec_channel *src0,
    127           const union tgsi_exec_channel *src1,
    128           const union tgsi_exec_channel *src2)
    129 {
    130    dst->f[0] = src0->f[0] < 0.0f ? src1->f[0] : src2->f[0];
    131    dst->f[1] = src0->f[1] < 0.0f ? src1->f[1] : src2->f[1];
    132    dst->f[2] = src0->f[2] < 0.0f ? src1->f[2] : src2->f[2];
    133    dst->f[3] = src0->f[3] < 0.0f ? src1->f[3] : src2->f[3];
    134 }
    135 
    136 static void
    137 micro_cnd(union tgsi_exec_channel *dst,
    138           const union tgsi_exec_channel *src0,
    139           const union tgsi_exec_channel *src1,
    140           const union tgsi_exec_channel *src2)
    141 {
    142    dst->f[0] = src2->f[0] > 0.5f ? src0->f[0] : src1->f[0];
    143    dst->f[1] = src2->f[1] > 0.5f ? src0->f[1] : src1->f[1];
    144    dst->f[2] = src2->f[2] > 0.5f ? src0->f[2] : src1->f[2];
    145    dst->f[3] = src2->f[3] > 0.5f ? src0->f[3] : src1->f[3];
    146 }
    147 
    148 static void
    149 micro_cos(union tgsi_exec_channel *dst,
    150           const union tgsi_exec_channel *src)
    151 {
    152    dst->f[0] = cosf(src->f[0]);
    153    dst->f[1] = cosf(src->f[1]);
    154    dst->f[2] = cosf(src->f[2]);
    155    dst->f[3] = cosf(src->f[3]);
    156 }
    157 
    158 static void
    159 micro_ddx(union tgsi_exec_channel *dst,
    160           const union tgsi_exec_channel *src)
    161 {
    162    dst->f[0] =
    163    dst->f[1] =
    164    dst->f[2] =
    165    dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
    166 }
    167 
    168 static void
    169 micro_ddy(union tgsi_exec_channel *dst,
    170           const union tgsi_exec_channel *src)
    171 {
    172    dst->f[0] =
    173    dst->f[1] =
    174    dst->f[2] =
    175    dst->f[3] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
    176 }
    177 
    178 static void
    179 micro_exp2(union tgsi_exec_channel *dst,
    180            const union tgsi_exec_channel *src)
    181 {
    182 #if FAST_MATH
    183    dst->f[0] = util_fast_exp2(src->f[0]);
    184    dst->f[1] = util_fast_exp2(src->f[1]);
    185    dst->f[2] = util_fast_exp2(src->f[2]);
    186    dst->f[3] = util_fast_exp2(src->f[3]);
    187 #else
    188 #if DEBUG
    189    /* Inf is okay for this instruction, so clamp it to silence assertions. */
    190    uint i;
    191    union tgsi_exec_channel clamped;
    192 
    193    for (i = 0; i < 4; i++) {
    194       if (src->f[i] > 127.99999f) {
    195          clamped.f[i] = 127.99999f;
    196       } else if (src->f[i] < -126.99999f) {
    197          clamped.f[i] = -126.99999f;
    198       } else {
    199          clamped.f[i] = src->f[i];
    200       }
    201    }
    202    src = &clamped;
    203 #endif /* DEBUG */
    204 
    205    dst->f[0] = powf(2.0f, src->f[0]);
    206    dst->f[1] = powf(2.0f, src->f[1]);
    207    dst->f[2] = powf(2.0f, src->f[2]);
    208    dst->f[3] = powf(2.0f, src->f[3]);
    209 #endif /* FAST_MATH */
    210 }
    211 
    212 static void
    213 micro_flr(union tgsi_exec_channel *dst,
    214           const union tgsi_exec_channel *src)
    215 {
    216    dst->f[0] = floorf(src->f[0]);
    217    dst->f[1] = floorf(src->f[1]);
    218    dst->f[2] = floorf(src->f[2]);
    219    dst->f[3] = floorf(src->f[3]);
    220 }
    221 
    222 static void
    223 micro_frc(union tgsi_exec_channel *dst,
    224           const union tgsi_exec_channel *src)
    225 {
    226    dst->f[0] = src->f[0] - floorf(src->f[0]);
    227    dst->f[1] = src->f[1] - floorf(src->f[1]);
    228    dst->f[2] = src->f[2] - floorf(src->f[2]);
    229    dst->f[3] = src->f[3] - floorf(src->f[3]);
    230 }
    231 
    232 static void
    233 micro_iabs(union tgsi_exec_channel *dst,
    234            const union tgsi_exec_channel *src)
    235 {
    236    dst->i[0] = src->i[0] >= 0 ? src->i[0] : -src->i[0];
    237    dst->i[1] = src->i[1] >= 0 ? src->i[1] : -src->i[1];
    238    dst->i[2] = src->i[2] >= 0 ? src->i[2] : -src->i[2];
    239    dst->i[3] = src->i[3] >= 0 ? src->i[3] : -src->i[3];
    240 }
    241 
    242 static void
    243 micro_ineg(union tgsi_exec_channel *dst,
    244            const union tgsi_exec_channel *src)
    245 {
    246    dst->i[0] = -src->i[0];
    247    dst->i[1] = -src->i[1];
    248    dst->i[2] = -src->i[2];
    249    dst->i[3] = -src->i[3];
    250 }
    251 
    252 static void
    253 micro_lg2(union tgsi_exec_channel *dst,
    254           const union tgsi_exec_channel *src)
    255 {
    256 #if FAST_MATH
    257    dst->f[0] = util_fast_log2(src->f[0]);
    258    dst->f[1] = util_fast_log2(src->f[1]);
    259    dst->f[2] = util_fast_log2(src->f[2]);
    260    dst->f[3] = util_fast_log2(src->f[3]);
    261 #else
    262    dst->f[0] = logf(src->f[0]) * 1.442695f;
    263    dst->f[1] = logf(src->f[1]) * 1.442695f;
    264    dst->f[2] = logf(src->f[2]) * 1.442695f;
    265    dst->f[3] = logf(src->f[3]) * 1.442695f;
    266 #endif
    267 }
    268 
    269 static void
    270 micro_lrp(union tgsi_exec_channel *dst,
    271           const union tgsi_exec_channel *src0,
    272           const union tgsi_exec_channel *src1,
    273           const union tgsi_exec_channel *src2)
    274 {
    275    dst->f[0] = src0->f[0] * (src1->f[0] - src2->f[0]) + src2->f[0];
    276    dst->f[1] = src0->f[1] * (src1->f[1] - src2->f[1]) + src2->f[1];
    277    dst->f[2] = src0->f[2] * (src1->f[2] - src2->f[2]) + src2->f[2];
    278    dst->f[3] = src0->f[3] * (src1->f[3] - src2->f[3]) + src2->f[3];
    279 }
    280 
    281 static void
    282 micro_mad(union tgsi_exec_channel *dst,
    283           const union tgsi_exec_channel *src0,
    284           const union tgsi_exec_channel *src1,
    285           const union tgsi_exec_channel *src2)
    286 {
    287    dst->f[0] = src0->f[0] * src1->f[0] + src2->f[0];
    288    dst->f[1] = src0->f[1] * src1->f[1] + src2->f[1];
    289    dst->f[2] = src0->f[2] * src1->f[2] + src2->f[2];
    290    dst->f[3] = src0->f[3] * src1->f[3] + src2->f[3];
    291 }
    292 
    293 static void
    294 micro_mov(union tgsi_exec_channel *dst,
    295           const union tgsi_exec_channel *src)
    296 {
    297    dst->u[0] = src->u[0];
    298    dst->u[1] = src->u[1];
    299    dst->u[2] = src->u[2];
    300    dst->u[3] = src->u[3];
    301 }
    302 
    303 static void
    304 micro_rcp(union tgsi_exec_channel *dst,
    305           const union tgsi_exec_channel *src)
    306 {
    307 #if 0 /* for debugging */
    308    assert(src->f[0] != 0.0f);
    309    assert(src->f[1] != 0.0f);
    310    assert(src->f[2] != 0.0f);
    311    assert(src->f[3] != 0.0f);
    312 #endif
    313    dst->f[0] = 1.0f / src->f[0];
    314    dst->f[1] = 1.0f / src->f[1];
    315    dst->f[2] = 1.0f / src->f[2];
    316    dst->f[3] = 1.0f / src->f[3];
    317 }
    318 
    319 static void
    320 micro_rnd(union tgsi_exec_channel *dst,
    321           const union tgsi_exec_channel *src)
    322 {
    323    dst->f[0] = floorf(src->f[0] + 0.5f);
    324    dst->f[1] = floorf(src->f[1] + 0.5f);
    325    dst->f[2] = floorf(src->f[2] + 0.5f);
    326    dst->f[3] = floorf(src->f[3] + 0.5f);
    327 }
    328 
    329 static void
    330 micro_rsq(union tgsi_exec_channel *dst,
    331           const union tgsi_exec_channel *src)
    332 {
    333 #if 0 /* for debugging */
    334    assert(src->f[0] != 0.0f);
    335    assert(src->f[1] != 0.0f);
    336    assert(src->f[2] != 0.0f);
    337    assert(src->f[3] != 0.0f);
    338 #endif
    339    dst->f[0] = 1.0f / sqrtf(fabsf(src->f[0]));
    340    dst->f[1] = 1.0f / sqrtf(fabsf(src->f[1]));
    341    dst->f[2] = 1.0f / sqrtf(fabsf(src->f[2]));
    342    dst->f[3] = 1.0f / sqrtf(fabsf(src->f[3]));
    343 }
    344 
    345 static void
    346 micro_seq(union tgsi_exec_channel *dst,
    347           const union tgsi_exec_channel *src0,
    348           const union tgsi_exec_channel *src1)
    349 {
    350    dst->f[0] = src0->f[0] == src1->f[0] ? 1.0f : 0.0f;
    351    dst->f[1] = src0->f[1] == src1->f[1] ? 1.0f : 0.0f;
    352    dst->f[2] = src0->f[2] == src1->f[2] ? 1.0f : 0.0f;
    353    dst->f[3] = src0->f[3] == src1->f[3] ? 1.0f : 0.0f;
    354 }
    355 
    356 static void
    357 micro_sge(union tgsi_exec_channel *dst,
    358           const union tgsi_exec_channel *src0,
    359           const union tgsi_exec_channel *src1)
    360 {
    361    dst->f[0] = src0->f[0] >= src1->f[0] ? 1.0f : 0.0f;
    362    dst->f[1] = src0->f[1] >= src1->f[1] ? 1.0f : 0.0f;
    363    dst->f[2] = src0->f[2] >= src1->f[2] ? 1.0f : 0.0f;
    364    dst->f[3] = src0->f[3] >= src1->f[3] ? 1.0f : 0.0f;
    365 }
    366 
    367 static void
    368 micro_sgn(union tgsi_exec_channel *dst,
    369           const union tgsi_exec_channel *src)
    370 {
    371    dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
    372    dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
    373    dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
    374    dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
    375 }
    376 
    377 static void
    378 micro_isgn(union tgsi_exec_channel *dst,
    379           const union tgsi_exec_channel *src)
    380 {
    381    dst->i[0] = src->i[0] < 0 ? -1 : src->i[0] > 0 ? 1 : 0;
    382    dst->i[1] = src->i[1] < 0 ? -1 : src->i[1] > 0 ? 1 : 0;
    383    dst->i[2] = src->i[2] < 0 ? -1 : src->i[2] > 0 ? 1 : 0;
    384    dst->i[3] = src->i[3] < 0 ? -1 : src->i[3] > 0 ? 1 : 0;
    385 }
    386 
    387 static void
    388 micro_sgt(union tgsi_exec_channel *dst,
    389           const union tgsi_exec_channel *src0,
    390           const union tgsi_exec_channel *src1)
    391 {
    392    dst->f[0] = src0->f[0] > src1->f[0] ? 1.0f : 0.0f;
    393    dst->f[1] = src0->f[1] > src1->f[1] ? 1.0f : 0.0f;
    394    dst->f[2] = src0->f[2] > src1->f[2] ? 1.0f : 0.0f;
    395    dst->f[3] = src0->f[3] > src1->f[3] ? 1.0f : 0.0f;
    396 }
    397 
    398 static void
    399 micro_sin(union tgsi_exec_channel *dst,
    400           const union tgsi_exec_channel *src)
    401 {
    402    dst->f[0] = sinf(src->f[0]);
    403    dst->f[1] = sinf(src->f[1]);
    404    dst->f[2] = sinf(src->f[2]);
    405    dst->f[3] = sinf(src->f[3]);
    406 }
    407 
    408 static void
    409 micro_sle(union tgsi_exec_channel *dst,
    410           const union tgsi_exec_channel *src0,
    411           const union tgsi_exec_channel *src1)
    412 {
    413    dst->f[0] = src0->f[0] <= src1->f[0] ? 1.0f : 0.0f;
    414    dst->f[1] = src0->f[1] <= src1->f[1] ? 1.0f : 0.0f;
    415    dst->f[2] = src0->f[2] <= src1->f[2] ? 1.0f : 0.0f;
    416    dst->f[3] = src0->f[3] <= src1->f[3] ? 1.0f : 0.0f;
    417 }
    418 
    419 static void
    420 micro_slt(union tgsi_exec_channel *dst,
    421           const union tgsi_exec_channel *src0,
    422           const union tgsi_exec_channel *src1)
    423 {
    424    dst->f[0] = src0->f[0] < src1->f[0] ? 1.0f : 0.0f;
    425    dst->f[1] = src0->f[1] < src1->f[1] ? 1.0f : 0.0f;
    426    dst->f[2] = src0->f[2] < src1->f[2] ? 1.0f : 0.0f;
    427    dst->f[3] = src0->f[3] < src1->f[3] ? 1.0f : 0.0f;
    428 }
    429 
    430 static void
    431 micro_sne(union tgsi_exec_channel *dst,
    432           const union tgsi_exec_channel *src0,
    433           const union tgsi_exec_channel *src1)
    434 {
    435    dst->f[0] = src0->f[0] != src1->f[0] ? 1.0f : 0.0f;
    436    dst->f[1] = src0->f[1] != src1->f[1] ? 1.0f : 0.0f;
    437    dst->f[2] = src0->f[2] != src1->f[2] ? 1.0f : 0.0f;
    438    dst->f[3] = src0->f[3] != src1->f[3] ? 1.0f : 0.0f;
    439 }
    440 
    441 static void
    442 micro_sfl(union tgsi_exec_channel *dst)
    443 {
    444    dst->f[0] = 0.0f;
    445    dst->f[1] = 0.0f;
    446    dst->f[2] = 0.0f;
    447    dst->f[3] = 0.0f;
    448 }
    449 
    450 static void
    451 micro_str(union tgsi_exec_channel *dst)
    452 {
    453    dst->f[0] = 1.0f;
    454    dst->f[1] = 1.0f;
    455    dst->f[2] = 1.0f;
    456    dst->f[3] = 1.0f;
    457 }
    458 
    459 static void
    460 micro_trunc(union tgsi_exec_channel *dst,
    461             const union tgsi_exec_channel *src)
    462 {
    463    dst->f[0] = (float)(int)src->f[0];
    464    dst->f[1] = (float)(int)src->f[1];
    465    dst->f[2] = (float)(int)src->f[2];
    466    dst->f[3] = (float)(int)src->f[3];
    467 }
    468 
    469 
    470 enum tgsi_exec_datatype {
    471    TGSI_EXEC_DATA_FLOAT,
    472    TGSI_EXEC_DATA_INT,
    473    TGSI_EXEC_DATA_UINT
    474 };
    475 
    476 /*
    477  * Shorthand locations of various utility registers (_I = Index, _C = Channel)
    478  */
    479 #define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
    480 #define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
    481 #define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
    482 #define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
    483 #define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
    484 #define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
    485 
    486 
    487 /** The execution mask depends on the conditional mask and the loop mask */
    488 #define UPDATE_EXEC_MASK(MACH) \
    489       MACH->ExecMask = MACH->CondMask & MACH->LoopMask & MACH->ContMask & MACH->Switch.mask & MACH->FuncMask
    490 
    491 
    492 static const union tgsi_exec_channel ZeroVec =
    493    { { 0.0, 0.0, 0.0, 0.0 } };
    494 
    495 static const union tgsi_exec_channel OneVec = {
    496    {1.0f, 1.0f, 1.0f, 1.0f}
    497 };
    498 
    499 static const union tgsi_exec_channel P128Vec = {
    500    {128.0f, 128.0f, 128.0f, 128.0f}
    501 };
    502 
    503 static const union tgsi_exec_channel M128Vec = {
    504    {-128.0f, -128.0f, -128.0f, -128.0f}
    505 };
    506 
    507 
    508 /**
    509  * Assert that none of the float values in 'chan' are infinite or NaN.
    510  * NaN and Inf may occur normally during program execution and should
    511  * not lead to crashes, etc.  But when debugging, it's helpful to catch
    512  * them.
    513  */
    514 static INLINE void
    515 check_inf_or_nan(const union tgsi_exec_channel *chan)
    516 {
    517    assert(!util_is_inf_or_nan((chan)->f[0]));
    518    assert(!util_is_inf_or_nan((chan)->f[1]));
    519    assert(!util_is_inf_or_nan((chan)->f[2]));
    520    assert(!util_is_inf_or_nan((chan)->f[3]));
    521 }
    522 
    523 
    524 #ifdef DEBUG
    525 static void
    526 print_chan(const char *msg, const union tgsi_exec_channel *chan)
    527 {
    528    debug_printf("%s = {%f, %f, %f, %f}\n",
    529                 msg, chan->f[0], chan->f[1], chan->f[2], chan->f[3]);
    530 }
    531 #endif
    532 
    533 
    534 #ifdef DEBUG
    535 static void
    536 print_temp(const struct tgsi_exec_machine *mach, uint index)
    537 {
    538    const struct tgsi_exec_vector *tmp = &mach->Temps[index];
    539    int i;
    540    debug_printf("Temp[%u] =\n", index);
    541    for (i = 0; i < 4; i++) {
    542       debug_printf("  %c: { %f, %f, %f, %f }\n",
    543                    "XYZW"[i],
    544                    tmp->xyzw[i].f[0],
    545                    tmp->xyzw[i].f[1],
    546                    tmp->xyzw[i].f[2],
    547                    tmp->xyzw[i].f[3]);
    548    }
    549 }
    550 #endif
    551 
    552 
    553 void
    554 tgsi_exec_set_constant_buffers(struct tgsi_exec_machine *mach,
    555                                unsigned num_bufs,
    556                                const void **bufs,
    557                                const unsigned *buf_sizes)
    558 {
    559    unsigned i;
    560 
    561    for (i = 0; i < num_bufs; i++) {
    562       mach->Consts[i] = bufs[i];
    563       mach->ConstsSize[i] = buf_sizes[i];
    564    }
    565 }
    566 
    567 
    568 /**
    569  * Check if there's a potential src/dst register data dependency when
    570  * using SOA execution.
    571  * Example:
    572  *   MOV T, T.yxwz;
    573  * This would expand into:
    574  *   MOV t0, t1;
    575  *   MOV t1, t0;
    576  *   MOV t2, t3;
    577  *   MOV t3, t2;
    578  * The second instruction will have the wrong value for t0 if executed as-is.
    579  */
    580 boolean
    581 tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst)
    582 {
    583    uint i, chan;
    584 
    585    uint writemask = inst->Dst[0].Register.WriteMask;
    586    if (writemask == TGSI_WRITEMASK_X ||
    587        writemask == TGSI_WRITEMASK_Y ||
    588        writemask == TGSI_WRITEMASK_Z ||
    589        writemask == TGSI_WRITEMASK_W ||
    590        writemask == TGSI_WRITEMASK_NONE) {
    591       /* no chance of data dependency */
    592       return FALSE;
    593    }
    594 
    595    /* loop over src regs */
    596    for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
    597       if ((inst->Src[i].Register.File ==
    598            inst->Dst[0].Register.File) &&
    599           ((inst->Src[i].Register.Index ==
    600             inst->Dst[0].Register.Index) ||
    601            inst->Src[i].Register.Indirect ||
    602            inst->Dst[0].Register.Indirect)) {
    603          /* loop over dest channels */
    604          uint channelsWritten = 0x0;
    605          for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
    606             if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
    607                /* check if we're reading a channel that's been written */
    608                uint swizzle = tgsi_util_get_full_src_register_swizzle(&inst->Src[i], chan);
    609                if (channelsWritten & (1 << swizzle)) {
    610                   return TRUE;
    611                }
    612 
    613                channelsWritten |= (1 << chan);
    614             }
    615          }
    616       }
    617    }
    618    return FALSE;
    619 }
    620 
    621 
    622 /**
    623  * Initialize machine state by expanding tokens to full instructions,
    624  * allocating temporary storage, setting up constants, etc.
    625  * After this, we can call tgsi_exec_machine_run() many times.
    626  */
    627 void
    628 tgsi_exec_machine_bind_shader(
    629    struct tgsi_exec_machine *mach,
    630    const struct tgsi_token *tokens,
    631    uint numSamplers,
    632    struct tgsi_sampler **samplers)
    633 {
    634    uint k;
    635    struct tgsi_parse_context parse;
    636    struct tgsi_full_instruction *instructions;
    637    struct tgsi_full_declaration *declarations;
    638    uint maxInstructions = 10, numInstructions = 0;
    639    uint maxDeclarations = 10, numDeclarations = 0;
    640 
    641 #if 0
    642    tgsi_dump(tokens, 0);
    643 #endif
    644 
    645    util_init_math();
    646 
    647    if (numSamplers) {
    648       assert(samplers);
    649    }
    650 
    651    mach->Tokens = tokens;
    652    mach->Samplers = samplers;
    653 
    654    if (!tokens) {
    655       /* unbind and free all */
    656       if (mach->Declarations) {
    657          FREE( mach->Declarations );
    658       }
    659       mach->Declarations = NULL;
    660       mach->NumDeclarations = 0;
    661 
    662       if (mach->Instructions) {
    663          FREE( mach->Instructions );
    664       }
    665       mach->Instructions = NULL;
    666       mach->NumInstructions = 0;
    667 
    668       return;
    669    }
    670 
    671    k = tgsi_parse_init (&parse, mach->Tokens);
    672    if (k != TGSI_PARSE_OK) {
    673       debug_printf( "Problem parsing!\n" );
    674       return;
    675    }
    676 
    677    mach->Processor = parse.FullHeader.Processor.Processor;
    678    mach->ImmLimit = 0;
    679 
    680    if (mach->Processor == TGSI_PROCESSOR_GEOMETRY &&
    681        !mach->UsedGeometryShader) {
    682       struct tgsi_exec_vector *inputs;
    683       struct tgsi_exec_vector *outputs;
    684 
    685       inputs = align_malloc(sizeof(struct tgsi_exec_vector) *
    686                             TGSI_MAX_PRIM_VERTICES * PIPE_MAX_ATTRIBS,
    687                             16);
    688 
    689       if (!inputs)
    690          return;
    691 
    692       outputs = align_malloc(sizeof(struct tgsi_exec_vector) *
    693                              TGSI_MAX_TOTAL_VERTICES, 16);
    694 
    695       if (!outputs) {
    696          align_free(inputs);
    697          return;
    698       }
    699 
    700       align_free(mach->Inputs);
    701       align_free(mach->Outputs);
    702 
    703       mach->Inputs = inputs;
    704       mach->Outputs = outputs;
    705       mach->UsedGeometryShader = TRUE;
    706    }
    707 
    708    declarations = (struct tgsi_full_declaration *)
    709       MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
    710 
    711    if (!declarations) {
    712       return;
    713    }
    714 
    715    instructions = (struct tgsi_full_instruction *)
    716       MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
    717 
    718    if (!instructions) {
    719       FREE( declarations );
    720       return;
    721    }
    722 
    723    while( !tgsi_parse_end_of_tokens( &parse ) ) {
    724       uint i;
    725 
    726       tgsi_parse_token( &parse );
    727       switch( parse.FullToken.Token.Type ) {
    728       case TGSI_TOKEN_TYPE_DECLARATION:
    729          /* save expanded declaration */
    730          if (numDeclarations == maxDeclarations) {
    731             declarations = REALLOC(declarations,
    732                                    maxDeclarations
    733                                    * sizeof(struct tgsi_full_declaration),
    734                                    (maxDeclarations + 10)
    735                                    * sizeof(struct tgsi_full_declaration));
    736             maxDeclarations += 10;
    737          }
    738          if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_OUTPUT) {
    739             unsigned reg;
    740             for (reg = parse.FullToken.FullDeclaration.Range.First;
    741                  reg <= parse.FullToken.FullDeclaration.Range.Last;
    742                  ++reg) {
    743                ++mach->NumOutputs;
    744             }
    745          }
    746          if (parse.FullToken.FullDeclaration.Declaration.File ==
    747              TGSI_FILE_IMMEDIATE_ARRAY) {
    748             unsigned reg;
    749             struct tgsi_full_declaration *decl =
    750                &parse.FullToken.FullDeclaration;
    751             debug_assert(decl->Range.Last < TGSI_EXEC_NUM_IMMEDIATES);
    752             for (reg = decl->Range.First; reg <= decl->Range.Last; ++reg) {
    753                for( i = 0; i < 4; i++ ) {
    754                   int idx = reg * 4 + i;
    755                   mach->ImmArray[reg][i] = decl->ImmediateData.u[idx].Float;
    756                }
    757             }
    758          }
    759          memcpy(declarations + numDeclarations,
    760                 &parse.FullToken.FullDeclaration,
    761                 sizeof(declarations[0]));
    762          numDeclarations++;
    763          break;
    764 
    765       case TGSI_TOKEN_TYPE_IMMEDIATE:
    766          {
    767             uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
    768             assert( size <= 4 );
    769             assert( mach->ImmLimit + 1 <= TGSI_EXEC_NUM_IMMEDIATES );
    770 
    771             for( i = 0; i < size; i++ ) {
    772                mach->Imms[mach->ImmLimit][i] =
    773 		  parse.FullToken.FullImmediate.u[i].Float;
    774             }
    775             mach->ImmLimit += 1;
    776          }
    777          break;
    778 
    779       case TGSI_TOKEN_TYPE_INSTRUCTION:
    780 
    781          /* save expanded instruction */
    782          if (numInstructions == maxInstructions) {
    783             instructions = REALLOC(instructions,
    784                                    maxInstructions
    785                                    * sizeof(struct tgsi_full_instruction),
    786                                    (maxInstructions + 10)
    787                                    * sizeof(struct tgsi_full_instruction));
    788             maxInstructions += 10;
    789          }
    790 
    791          memcpy(instructions + numInstructions,
    792                 &parse.FullToken.FullInstruction,
    793                 sizeof(instructions[0]));
    794 
    795          numInstructions++;
    796          break;
    797 
    798       case TGSI_TOKEN_TYPE_PROPERTY:
    799          break;
    800 
    801       default:
    802          assert( 0 );
    803       }
    804    }
    805    tgsi_parse_free (&parse);
    806 
    807    if (mach->Declarations) {
    808       FREE( mach->Declarations );
    809    }
    810    mach->Declarations = declarations;
    811    mach->NumDeclarations = numDeclarations;
    812 
    813    if (mach->Instructions) {
    814       FREE( mach->Instructions );
    815    }
    816    mach->Instructions = instructions;
    817    mach->NumInstructions = numInstructions;
    818 }
    819 
    820 
    821 struct tgsi_exec_machine *
    822 tgsi_exec_machine_create( void )
    823 {
    824    struct tgsi_exec_machine *mach;
    825    uint i;
    826 
    827    mach = align_malloc( sizeof *mach, 16 );
    828    if (!mach)
    829       goto fail;
    830 
    831    memset(mach, 0, sizeof(*mach));
    832 
    833    mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
    834    mach->MaxGeometryShaderOutputs = TGSI_MAX_TOTAL_VERTICES;
    835    mach->Predicates = &mach->Temps[TGSI_EXEC_TEMP_P0];
    836 
    837    mach->Inputs = align_malloc(sizeof(struct tgsi_exec_vector) * PIPE_MAX_ATTRIBS, 16);
    838    mach->Outputs = align_malloc(sizeof(struct tgsi_exec_vector) * PIPE_MAX_ATTRIBS, 16);
    839    if (!mach->Inputs || !mach->Outputs)
    840       goto fail;
    841 
    842    /* Setup constants needed by the SSE2 executor. */
    843    for( i = 0; i < 4; i++ ) {
    844       mach->Temps[TGSI_EXEC_TEMP_00000000_I].xyzw[TGSI_EXEC_TEMP_00000000_C].u[i] = 0x00000000;
    845       mach->Temps[TGSI_EXEC_TEMP_7FFFFFFF_I].xyzw[TGSI_EXEC_TEMP_7FFFFFFF_C].u[i] = 0x7FFFFFFF;
    846       mach->Temps[TGSI_EXEC_TEMP_80000000_I].xyzw[TGSI_EXEC_TEMP_80000000_C].u[i] = 0x80000000;
    847       mach->Temps[TGSI_EXEC_TEMP_FFFFFFFF_I].xyzw[TGSI_EXEC_TEMP_FFFFFFFF_C].u[i] = 0xFFFFFFFF;    /* not used */
    848       mach->Temps[TGSI_EXEC_TEMP_ONE_I].xyzw[TGSI_EXEC_TEMP_ONE_C].f[i] = 1.0f;
    849       mach->Temps[TGSI_EXEC_TEMP_TWO_I].xyzw[TGSI_EXEC_TEMP_TWO_C].f[i] = 2.0f;    /* not used */
    850       mach->Temps[TGSI_EXEC_TEMP_128_I].xyzw[TGSI_EXEC_TEMP_128_C].f[i] = 128.0f;
    851       mach->Temps[TGSI_EXEC_TEMP_MINUS_128_I].xyzw[TGSI_EXEC_TEMP_MINUS_128_C].f[i] = -128.0f;
    852       mach->Temps[TGSI_EXEC_TEMP_THREE_I].xyzw[TGSI_EXEC_TEMP_THREE_C].f[i] = 3.0f;
    853       mach->Temps[TGSI_EXEC_TEMP_HALF_I].xyzw[TGSI_EXEC_TEMP_HALF_C].f[i] = 0.5f;
    854    }
    855 
    856 #ifdef DEBUG
    857    /* silence warnings */
    858    (void) print_chan;
    859    (void) print_temp;
    860 #endif
    861 
    862    return mach;
    863 
    864 fail:
    865    if (mach) {
    866       align_free(mach->Inputs);
    867       align_free(mach->Outputs);
    868       align_free(mach);
    869    }
    870    return NULL;
    871 }
    872 
    873 
    874 void
    875 tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach)
    876 {
    877    if (mach) {
    878       if (mach->Instructions)
    879          FREE(mach->Instructions);
    880       if (mach->Declarations)
    881          FREE(mach->Declarations);
    882 
    883       align_free(mach->Inputs);
    884       align_free(mach->Outputs);
    885 
    886       align_free(mach);
    887    }
    888 }
    889 
    890 static void
    891 micro_add(union tgsi_exec_channel *dst,
    892           const union tgsi_exec_channel *src0,
    893           const union tgsi_exec_channel *src1)
    894 {
    895    dst->f[0] = src0->f[0] + src1->f[0];
    896    dst->f[1] = src0->f[1] + src1->f[1];
    897    dst->f[2] = src0->f[2] + src1->f[2];
    898    dst->f[3] = src0->f[3] + src1->f[3];
    899 }
    900 
    901 static void
    902 micro_div(
    903    union tgsi_exec_channel *dst,
    904    const union tgsi_exec_channel *src0,
    905    const union tgsi_exec_channel *src1 )
    906 {
    907    if (src1->f[0] != 0) {
    908       dst->f[0] = src0->f[0] / src1->f[0];
    909    }
    910    if (src1->f[1] != 0) {
    911       dst->f[1] = src0->f[1] / src1->f[1];
    912    }
    913    if (src1->f[2] != 0) {
    914       dst->f[2] = src0->f[2] / src1->f[2];
    915    }
    916    if (src1->f[3] != 0) {
    917       dst->f[3] = src0->f[3] / src1->f[3];
    918    }
    919 }
    920 
    921 static void
    922 micro_rcc(union tgsi_exec_channel *dst,
    923           const union tgsi_exec_channel *src)
    924 {
    925    uint i;
    926 
    927    for (i = 0; i < 4; i++) {
    928       float recip = 1.0f / src->f[i];
    929 
    930       if (recip > 0.0f) {
    931          if (recip > 1.884467e+019f) {
    932             dst->f[i] = 1.884467e+019f;
    933          }
    934          else if (recip < 5.42101e-020f) {
    935             dst->f[i] = 5.42101e-020f;
    936          }
    937          else {
    938             dst->f[i] = recip;
    939          }
    940       }
    941       else {
    942          if (recip < -1.884467e+019f) {
    943             dst->f[i] = -1.884467e+019f;
    944          }
    945          else if (recip > -5.42101e-020f) {
    946             dst->f[i] = -5.42101e-020f;
    947          }
    948          else {
    949             dst->f[i] = recip;
    950          }
    951       }
    952    }
    953 }
    954 
    955 static void
    956 micro_lt(
    957    union tgsi_exec_channel *dst,
    958    const union tgsi_exec_channel *src0,
    959    const union tgsi_exec_channel *src1,
    960    const union tgsi_exec_channel *src2,
    961    const union tgsi_exec_channel *src3 )
    962 {
    963    dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
    964    dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
    965    dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
    966    dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
    967 }
    968 
    969 static void
    970 micro_max(union tgsi_exec_channel *dst,
    971           const union tgsi_exec_channel *src0,
    972           const union tgsi_exec_channel *src1)
    973 {
    974    dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
    975    dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
    976    dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
    977    dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
    978 }
    979 
    980 static void
    981 micro_min(union tgsi_exec_channel *dst,
    982           const union tgsi_exec_channel *src0,
    983           const union tgsi_exec_channel *src1)
    984 {
    985    dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
    986    dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
    987    dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
    988    dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
    989 }
    990 
    991 static void
    992 micro_mul(union tgsi_exec_channel *dst,
    993           const union tgsi_exec_channel *src0,
    994           const union tgsi_exec_channel *src1)
    995 {
    996    dst->f[0] = src0->f[0] * src1->f[0];
    997    dst->f[1] = src0->f[1] * src1->f[1];
    998    dst->f[2] = src0->f[2] * src1->f[2];
    999    dst->f[3] = src0->f[3] * src1->f[3];
   1000 }
   1001 
   1002 static void
   1003 micro_neg(
   1004    union tgsi_exec_channel *dst,
   1005    const union tgsi_exec_channel *src )
   1006 {
   1007    dst->f[0] = -src->f[0];
   1008    dst->f[1] = -src->f[1];
   1009    dst->f[2] = -src->f[2];
   1010    dst->f[3] = -src->f[3];
   1011 }
   1012 
   1013 static void
   1014 micro_pow(
   1015    union tgsi_exec_channel *dst,
   1016    const union tgsi_exec_channel *src0,
   1017    const union tgsi_exec_channel *src1 )
   1018 {
   1019 #if FAST_MATH
   1020    dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] );
   1021    dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] );
   1022    dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] );
   1023    dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] );
   1024 #else
   1025    dst->f[0] = powf( src0->f[0], src1->f[0] );
   1026    dst->f[1] = powf( src0->f[1], src1->f[1] );
   1027    dst->f[2] = powf( src0->f[2], src1->f[2] );
   1028    dst->f[3] = powf( src0->f[3], src1->f[3] );
   1029 #endif
   1030 }
   1031 
   1032 static void
   1033 micro_sub(union tgsi_exec_channel *dst,
   1034           const union tgsi_exec_channel *src0,
   1035           const union tgsi_exec_channel *src1)
   1036 {
   1037    dst->f[0] = src0->f[0] - src1->f[0];
   1038    dst->f[1] = src0->f[1] - src1->f[1];
   1039    dst->f[2] = src0->f[2] - src1->f[2];
   1040    dst->f[3] = src0->f[3] - src1->f[3];
   1041 }
   1042 
   1043 static void
   1044 fetch_src_file_channel(const struct tgsi_exec_machine *mach,
   1045                        const uint chan_index,
   1046                        const uint file,
   1047                        const uint swizzle,
   1048                        const union tgsi_exec_channel *index,
   1049                        const union tgsi_exec_channel *index2D,
   1050                        union tgsi_exec_channel *chan)
   1051 {
   1052    uint i;
   1053 
   1054    assert(swizzle < 4);
   1055 
   1056    switch (file) {
   1057    case TGSI_FILE_CONSTANT:
   1058       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   1059          assert(index2D->i[i] >= 0 && index2D->i[i] < PIPE_MAX_CONSTANT_BUFFERS);
   1060          assert(mach->Consts[index2D->i[i]]);
   1061 
   1062          if (index->i[i] < 0) {
   1063             chan->u[i] = 0;
   1064          } else {
   1065             /* NOTE: copying the const value as a uint instead of float */
   1066             const uint constbuf = index2D->i[i];
   1067             const uint *buf = (const uint *)mach->Consts[constbuf];
   1068             const int pos = index->i[i] * 4 + swizzle;
   1069             /* const buffer bounds check */
   1070             if (pos < 0 || pos >= mach->ConstsSize[constbuf]) {
   1071                if (0) {
   1072                   /* Debug: print warning */
   1073                   static int count = 0;
   1074                   if (count++ < 100)
   1075                      debug_printf("TGSI Exec: const buffer index %d"
   1076                                   " out of bounds\n", pos);
   1077                }
   1078                chan->u[i] = 0;
   1079             }
   1080             else
   1081                chan->u[i] = buf[pos];
   1082          }
   1083       }
   1084       break;
   1085 
   1086    case TGSI_FILE_INPUT:
   1087       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   1088          /*
   1089          if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
   1090             debug_printf("Fetching Input[%d] (2d=%d, 1d=%d)\n",
   1091                          index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i],
   1092                          index2D->i[i], index->i[i]);
   1093                          }*/
   1094          int pos = index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i];
   1095          assert(pos >= 0);
   1096          assert(pos < TGSI_MAX_PRIM_VERTICES * PIPE_MAX_ATTRIBS);
   1097          chan->u[i] = mach->Inputs[pos].xyzw[swizzle].u[i];
   1098       }
   1099       break;
   1100 
   1101    case TGSI_FILE_SYSTEM_VALUE:
   1102       /* XXX no swizzling at this point.  Will be needed if we put
   1103        * gl_FragCoord, for example, in a sys value register.
   1104        */
   1105       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   1106          chan->u[i] = mach->SystemValue[index->i[i]].u[i];
   1107       }
   1108       break;
   1109 
   1110    case TGSI_FILE_TEMPORARY:
   1111       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   1112          assert(index->i[i] < TGSI_EXEC_NUM_TEMPS);
   1113          assert(index2D->i[i] == 0);
   1114 
   1115          chan->u[i] = mach->Temps[index->i[i]].xyzw[swizzle].u[i];
   1116       }
   1117       break;
   1118 
   1119    case TGSI_FILE_TEMPORARY_ARRAY:
   1120       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   1121          assert(index->i[i] < TGSI_EXEC_NUM_TEMPS);
   1122          assert(index2D->i[i] < TGSI_EXEC_NUM_TEMP_ARRAYS);
   1123 
   1124          chan->u[i] =
   1125             mach->TempArray[index2D->i[i]][index->i[i]].xyzw[swizzle].u[i];
   1126       }
   1127       break;
   1128 
   1129    case TGSI_FILE_IMMEDIATE:
   1130       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   1131          assert(index->i[i] >= 0 && index->i[i] < (int)mach->ImmLimit);
   1132          assert(index2D->i[i] == 0);
   1133 
   1134          chan->f[i] = mach->Imms[index->i[i]][swizzle];
   1135       }
   1136       break;
   1137 
   1138    case TGSI_FILE_IMMEDIATE_ARRAY:
   1139       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   1140          assert(index2D->i[i] == 0);
   1141 
   1142          chan->f[i] = mach->ImmArray[index->i[i]][swizzle];
   1143       }
   1144       break;
   1145 
   1146    case TGSI_FILE_ADDRESS:
   1147       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   1148          assert(index->i[i] >= 0);
   1149          assert(index2D->i[i] == 0);
   1150 
   1151          chan->u[i] = mach->Addrs[index->i[i]].xyzw[swizzle].u[i];
   1152       }
   1153       break;
   1154 
   1155    case TGSI_FILE_PREDICATE:
   1156       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   1157          assert(index->i[i] >= 0 && index->i[i] < TGSI_EXEC_NUM_PREDS);
   1158          assert(index2D->i[i] == 0);
   1159 
   1160          chan->u[i] = mach->Predicates[0].xyzw[swizzle].u[i];
   1161       }
   1162       break;
   1163 
   1164    case TGSI_FILE_OUTPUT:
   1165       /* vertex/fragment output vars can be read too */
   1166       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   1167          assert(index->i[i] >= 0);
   1168          assert(index2D->i[i] == 0);
   1169 
   1170          chan->u[i] = mach->Outputs[index->i[i]].xyzw[swizzle].u[i];
   1171       }
   1172       break;
   1173 
   1174    default:
   1175       assert(0);
   1176       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   1177          chan->u[i] = 0;
   1178       }
   1179    }
   1180 }
   1181 
   1182 static void
   1183 fetch_source(const struct tgsi_exec_machine *mach,
   1184              union tgsi_exec_channel *chan,
   1185              const struct tgsi_full_src_register *reg,
   1186              const uint chan_index,
   1187              enum tgsi_exec_datatype src_datatype)
   1188 {
   1189    union tgsi_exec_channel index;
   1190    union tgsi_exec_channel index2D;
   1191    uint swizzle;
   1192 
   1193    /* We start with a direct index into a register file.
   1194     *
   1195     *    file[1],
   1196     *    where:
   1197     *       file = Register.File
   1198     *       [1] = Register.Index
   1199     */
   1200    index.i[0] =
   1201    index.i[1] =
   1202    index.i[2] =
   1203    index.i[3] = reg->Register.Index;
   1204 
   1205    /* There is an extra source register that indirectly subscripts
   1206     * a register file. The direct index now becomes an offset
   1207     * that is being added to the indirect register.
   1208     *
   1209     *    file[ind[2].x+1],
   1210     *    where:
   1211     *       ind = Indirect.File
   1212     *       [2] = Indirect.Index
   1213     *       .x = Indirect.SwizzleX
   1214     */
   1215    if (reg->Register.Indirect) {
   1216       union tgsi_exec_channel index2;
   1217       union tgsi_exec_channel indir_index;
   1218       const uint execmask = mach->ExecMask;
   1219       uint i;
   1220 
   1221       /* which address register (always zero now) */
   1222       index2.i[0] =
   1223       index2.i[1] =
   1224       index2.i[2] =
   1225       index2.i[3] = reg->Indirect.Index;
   1226       assert(reg->Indirect.File == TGSI_FILE_ADDRESS);
   1227       /* get current value of address register[swizzle] */
   1228       swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, TGSI_CHAN_X );
   1229       fetch_src_file_channel(mach,
   1230                              chan_index,
   1231                              reg->Indirect.File,
   1232                              swizzle,
   1233                              &index2,
   1234                              &ZeroVec,
   1235                              &indir_index);
   1236 
   1237       /* add value of address register to the offset */
   1238       index.i[0] += indir_index.i[0];
   1239       index.i[1] += indir_index.i[1];
   1240       index.i[2] += indir_index.i[2];
   1241       index.i[3] += indir_index.i[3];
   1242 
   1243       /* for disabled execution channels, zero-out the index to
   1244        * avoid using a potential garbage value.
   1245        */
   1246       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   1247          if ((execmask & (1 << i)) == 0)
   1248             index.i[i] = 0;
   1249       }
   1250    }
   1251 
   1252    /* There is an extra source register that is a second
   1253     * subscript to a register file. Effectively it means that
   1254     * the register file is actually a 2D array of registers.
   1255     *
   1256     *    file[3][1],
   1257     *    where:
   1258     *       [3] = Dimension.Index
   1259     */
   1260    if (reg->Register.Dimension) {
   1261       index2D.i[0] =
   1262       index2D.i[1] =
   1263       index2D.i[2] =
   1264       index2D.i[3] = reg->Dimension.Index;
   1265 
   1266       /* Again, the second subscript index can be addressed indirectly
   1267        * identically to the first one.
   1268        * Nothing stops us from indirectly addressing the indirect register,
   1269        * but there is no need for that, so we won't exercise it.
   1270        *
   1271        *    file[ind[4].y+3][1],
   1272        *    where:
   1273        *       ind = DimIndirect.File
   1274        *       [4] = DimIndirect.Index
   1275        *       .y = DimIndirect.SwizzleX
   1276        */
   1277       if (reg->Dimension.Indirect) {
   1278          union tgsi_exec_channel index2;
   1279          union tgsi_exec_channel indir_index;
   1280          const uint execmask = mach->ExecMask;
   1281          uint i;
   1282 
   1283          index2.i[0] =
   1284          index2.i[1] =
   1285          index2.i[2] =
   1286          index2.i[3] = reg->DimIndirect.Index;
   1287 
   1288          swizzle = tgsi_util_get_src_register_swizzle( &reg->DimIndirect, TGSI_CHAN_X );
   1289          fetch_src_file_channel(mach,
   1290                                 chan_index,
   1291                                 reg->DimIndirect.File,
   1292                                 swizzle,
   1293                                 &index2,
   1294                                 &ZeroVec,
   1295                                 &indir_index);
   1296 
   1297          index2D.i[0] += indir_index.i[0];
   1298          index2D.i[1] += indir_index.i[1];
   1299          index2D.i[2] += indir_index.i[2];
   1300          index2D.i[3] += indir_index.i[3];
   1301 
   1302          /* for disabled execution channels, zero-out the index to
   1303           * avoid using a potential garbage value.
   1304           */
   1305          for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   1306             if ((execmask & (1 << i)) == 0) {
   1307                index2D.i[i] = 0;
   1308             }
   1309          }
   1310       }
   1311 
   1312       /* If by any chance there was a need for a 3D array of register
   1313        * files, we would have to check whether Dimension is followed
   1314        * by a dimension register and continue the saga.
   1315        */
   1316    } else {
   1317       index2D.i[0] =
   1318       index2D.i[1] =
   1319       index2D.i[2] =
   1320       index2D.i[3] = 0;
   1321    }
   1322 
   1323    swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
   1324    fetch_src_file_channel(mach,
   1325                           chan_index,
   1326                           reg->Register.File,
   1327                           swizzle,
   1328                           &index,
   1329                           &index2D,
   1330                           chan);
   1331 
   1332    if (reg->Register.Absolute) {
   1333       if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
   1334          micro_abs(chan, chan);
   1335       } else {
   1336          micro_iabs(chan, chan);
   1337       }
   1338    }
   1339 
   1340    if (reg->Register.Negate) {
   1341       if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
   1342          micro_neg(chan, chan);
   1343       } else {
   1344          micro_ineg(chan, chan);
   1345       }
   1346    }
   1347 }
   1348 
   1349 static void
   1350 store_dest(struct tgsi_exec_machine *mach,
   1351            const union tgsi_exec_channel *chan,
   1352            const struct tgsi_full_dst_register *reg,
   1353            const struct tgsi_full_instruction *inst,
   1354            uint chan_index,
   1355            enum tgsi_exec_datatype dst_datatype)
   1356 {
   1357    uint i;
   1358    union tgsi_exec_channel null;
   1359    union tgsi_exec_channel *dst;
   1360    union tgsi_exec_channel index2D;
   1361    uint execmask = mach->ExecMask;
   1362    int offset = 0;  /* indirection offset */
   1363    int index;
   1364 
   1365    /* for debugging */
   1366    if (0 && dst_datatype == TGSI_EXEC_DATA_FLOAT) {
   1367       check_inf_or_nan(chan);
   1368    }
   1369 
   1370    /* There is an extra source register that indirectly subscripts
   1371     * a register file. The direct index now becomes an offset
   1372     * that is being added to the indirect register.
   1373     *
   1374     *    file[ind[2].x+1],
   1375     *    where:
   1376     *       ind = Indirect.File
   1377     *       [2] = Indirect.Index
   1378     *       .x = Indirect.SwizzleX
   1379     */
   1380    if (reg->Register.Indirect) {
   1381       union tgsi_exec_channel index;
   1382       union tgsi_exec_channel indir_index;
   1383       uint swizzle;
   1384 
   1385       /* which address register (always zero for now) */
   1386       index.i[0] =
   1387       index.i[1] =
   1388       index.i[2] =
   1389       index.i[3] = reg->Indirect.Index;
   1390 
   1391       /* get current value of address register[swizzle] */
   1392       swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, TGSI_CHAN_X );
   1393 
   1394       /* fetch values from the address/indirection register */
   1395       fetch_src_file_channel(mach,
   1396                              chan_index,
   1397                              reg->Indirect.File,
   1398                              swizzle,
   1399                              &index,
   1400                              &ZeroVec,
   1401                              &indir_index);
   1402 
   1403       /* save indirection offset */
   1404       offset = indir_index.i[0];
   1405    }
   1406 
   1407    /* There is an extra source register that is a second
   1408     * subscript to a register file. Effectively it means that
   1409     * the register file is actually a 2D array of registers.
   1410     *
   1411     *    file[3][1],
   1412     *    where:
   1413     *       [3] = Dimension.Index
   1414     */
   1415    if (reg->Register.Dimension) {
   1416       index2D.i[0] =
   1417       index2D.i[1] =
   1418       index2D.i[2] =
   1419       index2D.i[3] = reg->Dimension.Index;
   1420 
   1421       /* Again, the second subscript index can be addressed indirectly
   1422        * identically to the first one.
   1423        * Nothing stops us from indirectly addressing the indirect register,
   1424        * but there is no need for that, so we won't exercise it.
   1425        *
   1426        *    file[ind[4].y+3][1],
   1427        *    where:
   1428        *       ind = DimIndirect.File
   1429        *       [4] = DimIndirect.Index
   1430        *       .y = DimIndirect.SwizzleX
   1431        */
   1432       if (reg->Dimension.Indirect) {
   1433          union tgsi_exec_channel index2;
   1434          union tgsi_exec_channel indir_index;
   1435          const uint execmask = mach->ExecMask;
   1436          unsigned swizzle;
   1437          uint i;
   1438 
   1439          index2.i[0] =
   1440          index2.i[1] =
   1441          index2.i[2] =
   1442          index2.i[3] = reg->DimIndirect.Index;
   1443 
   1444          swizzle = tgsi_util_get_src_register_swizzle( &reg->DimIndirect, TGSI_CHAN_X );
   1445          fetch_src_file_channel(mach,
   1446                                 chan_index,
   1447                                 reg->DimIndirect.File,
   1448                                 swizzle,
   1449                                 &index2,
   1450                                 &ZeroVec,
   1451                                 &indir_index);
   1452 
   1453          index2D.i[0] += indir_index.i[0];
   1454          index2D.i[1] += indir_index.i[1];
   1455          index2D.i[2] += indir_index.i[2];
   1456          index2D.i[3] += indir_index.i[3];
   1457 
   1458          /* for disabled execution channels, zero-out the index to
   1459           * avoid using a potential garbage value.
   1460           */
   1461          for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   1462             if ((execmask & (1 << i)) == 0) {
   1463                index2D.i[i] = 0;
   1464             }
   1465          }
   1466       }
   1467 
   1468       /* If by any chance there was a need for a 3D array of register
   1469        * files, we would have to check whether Dimension is followed
   1470        * by a dimension register and continue the saga.
   1471        */
   1472    } else {
   1473       index2D.i[0] =
   1474       index2D.i[1] =
   1475       index2D.i[2] =
   1476       index2D.i[3] = 0;
   1477    }
   1478 
   1479    switch (reg->Register.File) {
   1480    case TGSI_FILE_NULL:
   1481       dst = &null;
   1482       break;
   1483 
   1484    case TGSI_FILE_OUTPUT:
   1485       index = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
   1486          + reg->Register.Index;
   1487       dst = &mach->Outputs[offset + index].xyzw[chan_index];
   1488 #if 0
   1489       if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
   1490          fprintf(stderr, "STORING OUT[%d] mask(%d), = (", offset + index, execmask);
   1491          for (i = 0; i < TGSI_QUAD_SIZE; i++)
   1492             if (execmask & (1 << i))
   1493                fprintf(stderr, "%f, ", chan->f[i]);
   1494          fprintf(stderr, ")\n");
   1495       }
   1496 #endif
   1497       break;
   1498 
   1499    case TGSI_FILE_TEMPORARY:
   1500       index = reg->Register.Index;
   1501       assert( index < TGSI_EXEC_NUM_TEMPS );
   1502       dst = &mach->Temps[offset + index].xyzw[chan_index];
   1503       break;
   1504 
   1505    case TGSI_FILE_TEMPORARY_ARRAY:
   1506       index = reg->Register.Index;
   1507       assert( index < TGSI_EXEC_NUM_TEMPS );
   1508       assert( index2D.i[0] < TGSI_EXEC_NUM_TEMP_ARRAYS );
   1509       /* XXX we use index2D.i[0] here but somehow we might
   1510        * end up with someone trying to store indirectly in
   1511        * different buffers */
   1512       dst = &mach->TempArray[index2D.i[0]][offset + index].xyzw[chan_index];
   1513       break;
   1514 
   1515    case TGSI_FILE_ADDRESS:
   1516       index = reg->Register.Index;
   1517       dst = &mach->Addrs[index].xyzw[chan_index];
   1518       break;
   1519 
   1520    case TGSI_FILE_PREDICATE:
   1521       index = reg->Register.Index;
   1522       assert(index < TGSI_EXEC_NUM_PREDS);
   1523       dst = &mach->Predicates[index].xyzw[chan_index];
   1524       break;
   1525 
   1526    default:
   1527       assert( 0 );
   1528       return;
   1529    }
   1530 
   1531    if (inst->Instruction.Predicate) {
   1532       uint swizzle;
   1533       union tgsi_exec_channel *pred;
   1534 
   1535       switch (chan_index) {
   1536       case TGSI_CHAN_X:
   1537          swizzle = inst->Predicate.SwizzleX;
   1538          break;
   1539       case TGSI_CHAN_Y:
   1540          swizzle = inst->Predicate.SwizzleY;
   1541          break;
   1542       case TGSI_CHAN_Z:
   1543          swizzle = inst->Predicate.SwizzleZ;
   1544          break;
   1545       case TGSI_CHAN_W:
   1546          swizzle = inst->Predicate.SwizzleW;
   1547          break;
   1548       default:
   1549          assert(0);
   1550          return;
   1551       }
   1552 
   1553       assert(inst->Predicate.Index == 0);
   1554 
   1555       pred = &mach->Predicates[inst->Predicate.Index].xyzw[swizzle];
   1556 
   1557       if (inst->Predicate.Negate) {
   1558          for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   1559             if (pred->u[i]) {
   1560                execmask &= ~(1 << i);
   1561             }
   1562          }
   1563       } else {
   1564          for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   1565             if (!pred->u[i]) {
   1566                execmask &= ~(1 << i);
   1567             }
   1568          }
   1569       }
   1570    }
   1571 
   1572    switch (inst->Instruction.Saturate) {
   1573    case TGSI_SAT_NONE:
   1574       for (i = 0; i < TGSI_QUAD_SIZE; i++)
   1575          if (execmask & (1 << i))
   1576             dst->i[i] = chan->i[i];
   1577       break;
   1578 
   1579    case TGSI_SAT_ZERO_ONE:
   1580       for (i = 0; i < TGSI_QUAD_SIZE; i++)
   1581          if (execmask & (1 << i)) {
   1582             if (chan->f[i] < 0.0f)
   1583                dst->f[i] = 0.0f;
   1584             else if (chan->f[i] > 1.0f)
   1585                dst->f[i] = 1.0f;
   1586             else
   1587                dst->i[i] = chan->i[i];
   1588          }
   1589       break;
   1590 
   1591    case TGSI_SAT_MINUS_PLUS_ONE:
   1592       for (i = 0; i < TGSI_QUAD_SIZE; i++)
   1593          if (execmask & (1 << i)) {
   1594             if (chan->f[i] < -1.0f)
   1595                dst->f[i] = -1.0f;
   1596             else if (chan->f[i] > 1.0f)
   1597                dst->f[i] = 1.0f;
   1598             else
   1599                dst->i[i] = chan->i[i];
   1600          }
   1601       break;
   1602 
   1603    default:
   1604       assert( 0 );
   1605    }
   1606 }
   1607 
   1608 #define FETCH(VAL,INDEX,CHAN)\
   1609     fetch_source(mach, VAL, &inst->Src[INDEX], CHAN, TGSI_EXEC_DATA_FLOAT)
   1610 
   1611 #define IFETCH(VAL,INDEX,CHAN)\
   1612     fetch_source(mach, VAL, &inst->Src[INDEX], CHAN, TGSI_EXEC_DATA_INT)
   1613 
   1614 
   1615 /**
   1616  * Execute ARB-style KIL which is predicated by a src register.
   1617  * Kill fragment if any of the four values is less than zero.
   1618  */
   1619 static void
   1620 exec_kil(struct tgsi_exec_machine *mach,
   1621          const struct tgsi_full_instruction *inst)
   1622 {
   1623    uint uniquemask;
   1624    uint chan_index;
   1625    uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
   1626    union tgsi_exec_channel r[1];
   1627 
   1628    /* This mask stores component bits that were already tested. */
   1629    uniquemask = 0;
   1630 
   1631    for (chan_index = 0; chan_index < 4; chan_index++)
   1632    {
   1633       uint swizzle;
   1634       uint i;
   1635 
   1636       /* unswizzle channel */
   1637       swizzle = tgsi_util_get_full_src_register_swizzle (
   1638                         &inst->Src[0],
   1639                         chan_index);
   1640 
   1641       /* check if the component has not been already tested */
   1642       if (uniquemask & (1 << swizzle))
   1643          continue;
   1644       uniquemask |= 1 << swizzle;
   1645 
   1646       FETCH(&r[0], 0, chan_index);
   1647       for (i = 0; i < 4; i++)
   1648          if (r[0].f[i] < 0.0f)
   1649             kilmask |= 1 << i;
   1650    }
   1651 
   1652    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
   1653 }
   1654 
   1655 /**
   1656  * Execute NVIDIA-style KIL which is predicated by a condition code.
   1657  * Kill fragment if the condition code is TRUE.
   1658  */
   1659 static void
   1660 exec_kilp(struct tgsi_exec_machine *mach,
   1661           const struct tgsi_full_instruction *inst)
   1662 {
   1663    uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
   1664 
   1665    /* "unconditional" kil */
   1666    kilmask = mach->ExecMask;
   1667    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
   1668 }
   1669 
   1670 static void
   1671 emit_vertex(struct tgsi_exec_machine *mach)
   1672 {
   1673    /* FIXME: check for exec mask correctly
   1674    unsigned i;
   1675    for (i = 0; i < TGSI_QUAD_SIZE; ++i) {
   1676          if ((mach->ExecMask & (1 << i)))
   1677    */
   1678    if (mach->ExecMask) {
   1679       mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += mach->NumOutputs;
   1680       mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
   1681    }
   1682 }
   1683 
   1684 static void
   1685 emit_primitive(struct tgsi_exec_machine *mach)
   1686 {
   1687    unsigned *prim_count = &mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0];
   1688    /* FIXME: check for exec mask correctly
   1689    unsigned i;
   1690    for (i = 0; i < TGSI_QUAD_SIZE; ++i) {
   1691          if ((mach->ExecMask & (1 << i)))
   1692    */
   1693    if (mach->ExecMask) {
   1694       ++(*prim_count);
   1695       debug_assert((*prim_count * mach->NumOutputs) < mach->MaxGeometryShaderOutputs);
   1696       mach->Primitives[*prim_count] = 0;
   1697    }
   1698 }
   1699 
   1700 static void
   1701 conditional_emit_primitive(struct tgsi_exec_machine *mach)
   1702 {
   1703    if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
   1704       int emitted_verts =
   1705          mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]];
   1706       if (emitted_verts) {
   1707          emit_primitive(mach);
   1708       }
   1709    }
   1710 }
   1711 
   1712 
   1713 /*
   1714  * Fetch four texture samples using STR texture coordinates.
   1715  */
   1716 static void
   1717 fetch_texel( struct tgsi_sampler *sampler,
   1718              const union tgsi_exec_channel *s,
   1719              const union tgsi_exec_channel *t,
   1720              const union tgsi_exec_channel *p,
   1721              const union tgsi_exec_channel *c0,
   1722              enum tgsi_sampler_control control,
   1723              union tgsi_exec_channel *r,
   1724              union tgsi_exec_channel *g,
   1725              union tgsi_exec_channel *b,
   1726              union tgsi_exec_channel *a )
   1727 {
   1728    uint j;
   1729    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
   1730 
   1731    sampler->get_samples(sampler, s->f, t->f, p->f, c0->f, control, rgba);
   1732 
   1733    for (j = 0; j < 4; j++) {
   1734       r->f[j] = rgba[0][j];
   1735       g->f[j] = rgba[1][j];
   1736       b->f[j] = rgba[2][j];
   1737       a->f[j] = rgba[3][j];
   1738    }
   1739 }
   1740 
   1741 
   1742 #define TEX_MODIFIER_NONE           0
   1743 #define TEX_MODIFIER_PROJECTED      1
   1744 #define TEX_MODIFIER_LOD_BIAS       2
   1745 #define TEX_MODIFIER_EXPLICIT_LOD   3
   1746 
   1747 
   1748 static void
   1749 exec_tex(struct tgsi_exec_machine *mach,
   1750          const struct tgsi_full_instruction *inst,
   1751          uint modifier)
   1752 {
   1753    const uint unit = inst->Src[1].Register.Index;
   1754    union tgsi_exec_channel r[4];
   1755    const union tgsi_exec_channel *lod = &ZeroVec;
   1756    enum tgsi_sampler_control control;
   1757    uint chan;
   1758 
   1759    if (modifier != TEX_MODIFIER_NONE) {
   1760       FETCH(&r[3], 0, TGSI_CHAN_W);
   1761       if (modifier != TEX_MODIFIER_PROJECTED) {
   1762          lod = &r[3];
   1763       }
   1764    }
   1765 
   1766    if (modifier == TEX_MODIFIER_EXPLICIT_LOD) {
   1767       control = tgsi_sampler_lod_explicit;
   1768    } else {
   1769       control = tgsi_sampler_lod_bias;
   1770    }
   1771 
   1772    switch (inst->Texture.Texture) {
   1773    case TGSI_TEXTURE_1D:
   1774       FETCH(&r[0], 0, TGSI_CHAN_X);
   1775 
   1776       if (modifier == TEX_MODIFIER_PROJECTED) {
   1777          micro_div(&r[0], &r[0], &r[3]);
   1778       }
   1779 
   1780       fetch_texel(mach->Samplers[unit],
   1781                   &r[0], &ZeroVec, &ZeroVec, lod,  /* S, T, P, LOD */
   1782                   control,
   1783                   &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
   1784       break;
   1785    case TGSI_TEXTURE_SHADOW1D:
   1786       FETCH(&r[0], 0, TGSI_CHAN_X);
   1787       FETCH(&r[2], 0, TGSI_CHAN_Z);
   1788 
   1789       if (modifier == TEX_MODIFIER_PROJECTED) {
   1790          micro_div(&r[0], &r[0], &r[3]);
   1791       }
   1792 
   1793       fetch_texel(mach->Samplers[unit],
   1794                   &r[0], &ZeroVec, &r[2], lod,  /* S, T, P, LOD */
   1795                   control,
   1796                   &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
   1797       break;
   1798 
   1799    case TGSI_TEXTURE_2D:
   1800    case TGSI_TEXTURE_RECT:
   1801    case TGSI_TEXTURE_SHADOW2D:
   1802    case TGSI_TEXTURE_SHADOWRECT:
   1803       FETCH(&r[0], 0, TGSI_CHAN_X);
   1804       FETCH(&r[1], 0, TGSI_CHAN_Y);
   1805       FETCH(&r[2], 0, TGSI_CHAN_Z);
   1806 
   1807       if (modifier == TEX_MODIFIER_PROJECTED) {
   1808          micro_div(&r[0], &r[0], &r[3]);
   1809          micro_div(&r[1], &r[1], &r[3]);
   1810          micro_div(&r[2], &r[2], &r[3]);
   1811       }
   1812 
   1813       fetch_texel(mach->Samplers[unit],
   1814                   &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
   1815                   control,
   1816                   &r[0], &r[1], &r[2], &r[3]);  /* outputs */
   1817       break;
   1818 
   1819    case TGSI_TEXTURE_1D_ARRAY:
   1820       FETCH(&r[0], 0, TGSI_CHAN_X);
   1821       FETCH(&r[1], 0, TGSI_CHAN_Y);
   1822 
   1823       if (modifier == TEX_MODIFIER_PROJECTED) {
   1824          micro_div(&r[0], &r[0], &r[3]);
   1825       }
   1826 
   1827       fetch_texel(mach->Samplers[unit],
   1828                   &r[0], &r[1], &ZeroVec, lod,     /* S, T, P, LOD */
   1829                   control,
   1830                   &r[0], &r[1], &r[2], &r[3]);  /* outputs */
   1831       break;
   1832    case TGSI_TEXTURE_SHADOW1D_ARRAY:
   1833       FETCH(&r[0], 0, TGSI_CHAN_X);
   1834       FETCH(&r[1], 0, TGSI_CHAN_Y);
   1835       FETCH(&r[2], 0, TGSI_CHAN_Z);
   1836 
   1837       if (modifier == TEX_MODIFIER_PROJECTED) {
   1838          micro_div(&r[0], &r[0], &r[3]);
   1839       }
   1840 
   1841       fetch_texel(mach->Samplers[unit],
   1842                   &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
   1843                   control,
   1844                   &r[0], &r[1], &r[2], &r[3]);  /* outputs */
   1845       break;
   1846 
   1847    case TGSI_TEXTURE_2D_ARRAY:
   1848       FETCH(&r[0], 0, TGSI_CHAN_X);
   1849       FETCH(&r[1], 0, TGSI_CHAN_Y);
   1850       FETCH(&r[2], 0, TGSI_CHAN_Z);
   1851 
   1852       if (modifier == TEX_MODIFIER_PROJECTED) {
   1853          micro_div(&r[0], &r[0], &r[3]);
   1854          micro_div(&r[1], &r[1], &r[3]);
   1855       }
   1856 
   1857       fetch_texel(mach->Samplers[unit],
   1858                   &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
   1859                   control,
   1860                   &r[0], &r[1], &r[2], &r[3]);  /* outputs */
   1861       break;
   1862    case TGSI_TEXTURE_SHADOW2D_ARRAY:
   1863    case TGSI_TEXTURE_SHADOWCUBE:
   1864       FETCH(&r[0], 0, TGSI_CHAN_X);
   1865       FETCH(&r[1], 0, TGSI_CHAN_Y);
   1866       FETCH(&r[2], 0, TGSI_CHAN_Z);
   1867       FETCH(&r[3], 0, TGSI_CHAN_W);
   1868 
   1869       fetch_texel(mach->Samplers[unit],
   1870                   &r[0], &r[1], &r[2], &r[3],     /* S, T, P, LOD */
   1871                   control,
   1872                   &r[0], &r[1], &r[2], &r[3]);  /* outputs */
   1873       break;
   1874    case TGSI_TEXTURE_3D:
   1875    case TGSI_TEXTURE_CUBE:
   1876       FETCH(&r[0], 0, TGSI_CHAN_X);
   1877       FETCH(&r[1], 0, TGSI_CHAN_Y);
   1878       FETCH(&r[2], 0, TGSI_CHAN_Z);
   1879 
   1880       if (modifier == TEX_MODIFIER_PROJECTED) {
   1881          micro_div(&r[0], &r[0], &r[3]);
   1882          micro_div(&r[1], &r[1], &r[3]);
   1883          micro_div(&r[2], &r[2], &r[3]);
   1884       }
   1885 
   1886       fetch_texel(mach->Samplers[unit],
   1887                   &r[0], &r[1], &r[2], lod,
   1888                   control,
   1889                   &r[0], &r[1], &r[2], &r[3]);
   1890       break;
   1891 
   1892    default:
   1893       assert(0);
   1894    }
   1895 
   1896 #if 0
   1897    debug_printf("fetch r: %g %g %g %g\n",
   1898          r[0].f[0], r[0].f[1], r[0].f[2], r[0].f[3]);
   1899    debug_printf("fetch g: %g %g %g %g\n",
   1900          r[1].f[0], r[1].f[1], r[1].f[2], r[1].f[3]);
   1901    debug_printf("fetch b: %g %g %g %g\n",
   1902          r[2].f[0], r[2].f[1], r[2].f[2], r[2].f[3]);
   1903    debug_printf("fetch a: %g %g %g %g\n",
   1904          r[3].f[0], r[3].f[1], r[3].f[2], r[3].f[3]);
   1905 #endif
   1906 
   1907    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   1908       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   1909          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
   1910       }
   1911    }
   1912 }
   1913 
   1914 static void
   1915 exec_txd(struct tgsi_exec_machine *mach,
   1916          const struct tgsi_full_instruction *inst)
   1917 {
   1918    const uint unit = inst->Src[3].Register.Index;
   1919    union tgsi_exec_channel r[4];
   1920    uint chan;
   1921 
   1922    /*
   1923     * XXX: This is fake TXD -- the derivatives are not taken into account, yet.
   1924     */
   1925 
   1926    switch (inst->Texture.Texture) {
   1927    case TGSI_TEXTURE_1D:
   1928    case TGSI_TEXTURE_SHADOW1D:
   1929 
   1930       FETCH(&r[0], 0, TGSI_CHAN_X);
   1931 
   1932       fetch_texel(mach->Samplers[unit],
   1933                   &r[0], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, BIAS */
   1934                   tgsi_sampler_lod_bias,
   1935                   &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
   1936       break;
   1937 
   1938    case TGSI_TEXTURE_1D_ARRAY:
   1939    case TGSI_TEXTURE_2D:
   1940    case TGSI_TEXTURE_RECT:
   1941    case TGSI_TEXTURE_SHADOW1D_ARRAY:
   1942    case TGSI_TEXTURE_SHADOW2D:
   1943    case TGSI_TEXTURE_SHADOWRECT:
   1944 
   1945       FETCH(&r[0], 0, TGSI_CHAN_X);
   1946       FETCH(&r[1], 0, TGSI_CHAN_Y);
   1947       FETCH(&r[2], 0, TGSI_CHAN_Z);
   1948 
   1949       fetch_texel(mach->Samplers[unit],
   1950                   &r[0], &r[1], &r[2], &ZeroVec,   /* inputs */
   1951                   tgsi_sampler_lod_bias,
   1952                   &r[0], &r[1], &r[2], &r[3]);     /* outputs */
   1953       break;
   1954 
   1955    case TGSI_TEXTURE_2D_ARRAY:
   1956    case TGSI_TEXTURE_3D:
   1957    case TGSI_TEXTURE_CUBE:
   1958 
   1959       FETCH(&r[0], 0, TGSI_CHAN_X);
   1960       FETCH(&r[1], 0, TGSI_CHAN_Y);
   1961       FETCH(&r[2], 0, TGSI_CHAN_Z);
   1962 
   1963       fetch_texel(mach->Samplers[unit],
   1964                   &r[0], &r[1], &r[2], &ZeroVec,
   1965                   tgsi_sampler_lod_bias,
   1966                   &r[0], &r[1], &r[2], &r[3]);
   1967       break;
   1968 
   1969    case TGSI_TEXTURE_SHADOW2D_ARRAY:
   1970 
   1971       FETCH(&r[0], 0, TGSI_CHAN_X);
   1972       FETCH(&r[1], 0, TGSI_CHAN_Y);
   1973       FETCH(&r[2], 0, TGSI_CHAN_Z);
   1974       FETCH(&r[3], 0, TGSI_CHAN_W);
   1975 
   1976       fetch_texel(mach->Samplers[unit],
   1977                   &r[0], &r[1], &r[2], &r[3],
   1978                   tgsi_sampler_lod_bias,
   1979                   &r[0], &r[1], &r[2], &r[3]);
   1980       break;
   1981 
   1982    default:
   1983       assert(0);
   1984    }
   1985 
   1986    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   1987       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   1988          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
   1989       }
   1990    }
   1991 }
   1992 
   1993 
   1994 static void
   1995 exec_txf(struct tgsi_exec_machine *mach,
   1996 	 const struct tgsi_full_instruction *inst)
   1997 {
   1998    struct tgsi_sampler *sampler;
   1999    const uint unit = inst->Src[2].Register.Index;
   2000    union tgsi_exec_channel r[4];
   2001    union tgsi_exec_channel offset[3];
   2002    uint chan;
   2003    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
   2004    int j;
   2005    int8_t offsets[3];
   2006 
   2007    if (inst->Texture.NumOffsets == 1) {
   2008       union tgsi_exec_channel index;
   2009       index.i[0] = index.i[1] = index.i[2] = index.i[3] = inst->TexOffsets[0].Index;
   2010       fetch_src_file_channel(mach, 0, inst->TexOffsets[0].File,
   2011                              inst->TexOffsets[0].SwizzleX, &index, &ZeroVec, &offset[0]);
   2012       fetch_src_file_channel(mach, 0, inst->TexOffsets[0].File,
   2013                              inst->TexOffsets[0].SwizzleY, &index, &ZeroVec, &offset[1]);
   2014       fetch_src_file_channel(mach, 0, inst->TexOffsets[0].File,
   2015                              inst->TexOffsets[0].SwizzleZ, &index, &ZeroVec, &offset[2]);
   2016      offsets[0] = offset[0].i[0];
   2017      offsets[1] = offset[1].i[0];
   2018      offsets[2] = offset[2].i[0];
   2019    } else
   2020      offsets[0] = offsets[1] = offsets[2] = 0;
   2021 
   2022    IFETCH(&r[3], 0, TGSI_CHAN_W);
   2023 
   2024    switch(inst->Texture.Texture) {
   2025    case TGSI_TEXTURE_3D:
   2026    case TGSI_TEXTURE_2D_ARRAY:
   2027    case TGSI_TEXTURE_SHADOW2D_ARRAY:
   2028       IFETCH(&r[2], 0, TGSI_CHAN_Z);
   2029       /* fallthrough */
   2030    case TGSI_TEXTURE_2D:
   2031    case TGSI_TEXTURE_RECT:
   2032    case TGSI_TEXTURE_SHADOW1D_ARRAY:
   2033    case TGSI_TEXTURE_SHADOW2D:
   2034    case TGSI_TEXTURE_SHADOWRECT:
   2035    case TGSI_TEXTURE_1D_ARRAY:
   2036       IFETCH(&r[1], 0, TGSI_CHAN_Y);
   2037       /* fallthrough */
   2038    case TGSI_TEXTURE_1D:
   2039    case TGSI_TEXTURE_SHADOW1D:
   2040       IFETCH(&r[0], 0, TGSI_CHAN_X);
   2041       break;
   2042    default:
   2043       assert(0);
   2044       break;
   2045    }
   2046 
   2047    sampler = mach->Samplers[unit];
   2048    sampler->get_texel(sampler, r[0].i, r[1].i, r[2].i, r[3].i,
   2049 		      offsets, rgba);
   2050 
   2051    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
   2052       r[0].f[j] = rgba[0][j];
   2053       r[1].f[j] = rgba[1][j];
   2054       r[2].f[j] = rgba[2][j];
   2055       r[3].f[j] = rgba[3][j];
   2056    }
   2057 
   2058    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   2059       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   2060          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
   2061       }
   2062    }
   2063 }
   2064 
   2065 static void
   2066 exec_txq(struct tgsi_exec_machine *mach,
   2067          const struct tgsi_full_instruction *inst)
   2068 {
   2069    struct tgsi_sampler *sampler;
   2070    const uint unit = inst->Src[1].Register.Index;
   2071    int result[4];
   2072    union tgsi_exec_channel r[4], src;
   2073    uint chan;
   2074    int i,j;
   2075 
   2076    fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_INT);
   2077    sampler = mach->Samplers[unit];
   2078 
   2079    sampler->get_dims(sampler, src.i[0], result);
   2080 
   2081    for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   2082       for (j = 0; j < 4; j++) {
   2083 	 r[j].i[i] = result[j];
   2084       }
   2085    }
   2086 
   2087    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   2088       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   2089 	 store_dest(mach, &r[chan], &inst->Dst[0], inst, chan,
   2090 		    TGSI_EXEC_DATA_INT);
   2091       }
   2092    }
   2093 }
   2094 
   2095 static void
   2096 exec_sample(struct tgsi_exec_machine *mach,
   2097             const struct tgsi_full_instruction *inst,
   2098             uint modifier)
   2099 {
   2100    const uint resource_unit = inst->Src[1].Register.Index;
   2101    const uint sampler_unit = inst->Src[2].Register.Index;
   2102    union tgsi_exec_channel r[4];
   2103    const union tgsi_exec_channel *lod = &ZeroVec;
   2104    enum tgsi_sampler_control control;
   2105    uint chan;
   2106 
   2107    if (modifier != TEX_MODIFIER_NONE) {
   2108       if (modifier == TEX_MODIFIER_LOD_BIAS)
   2109          FETCH(&r[3], 3, TGSI_CHAN_X);
   2110       else /*TEX_MODIFIER_LOD*/
   2111          FETCH(&r[3], 0, TGSI_CHAN_W);
   2112 
   2113       if (modifier != TEX_MODIFIER_PROJECTED) {
   2114          lod = &r[3];
   2115       }
   2116    }
   2117 
   2118    if (modifier == TEX_MODIFIER_EXPLICIT_LOD) {
   2119       control = tgsi_sampler_lod_explicit;
   2120    } else {
   2121       control = tgsi_sampler_lod_bias;
   2122    }
   2123 
   2124    switch (mach->SamplerViews[resource_unit].Resource) {
   2125    case TGSI_TEXTURE_1D:
   2126    case TGSI_TEXTURE_SHADOW1D:
   2127       FETCH(&r[0], 0, TGSI_CHAN_X);
   2128 
   2129       if (modifier == TEX_MODIFIER_PROJECTED) {
   2130          micro_div(&r[0], &r[0], &r[3]);
   2131       }
   2132 
   2133       fetch_texel(mach->Samplers[sampler_unit],
   2134                   &r[0], &ZeroVec, &ZeroVec, lod,  /* S, T, P, LOD */
   2135                   control,
   2136                   &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
   2137       break;
   2138 
   2139    case TGSI_TEXTURE_1D_ARRAY:
   2140    case TGSI_TEXTURE_2D:
   2141    case TGSI_TEXTURE_RECT:
   2142    case TGSI_TEXTURE_SHADOW1D_ARRAY:
   2143    case TGSI_TEXTURE_SHADOW2D:
   2144    case TGSI_TEXTURE_SHADOWRECT:
   2145       FETCH(&r[0], 0, TGSI_CHAN_X);
   2146       FETCH(&r[1], 0, TGSI_CHAN_Y);
   2147       FETCH(&r[2], 0, TGSI_CHAN_Z);
   2148 
   2149       if (modifier == TEX_MODIFIER_PROJECTED) {
   2150          micro_div(&r[0], &r[0], &r[3]);
   2151          micro_div(&r[1], &r[1], &r[3]);
   2152          micro_div(&r[2], &r[2], &r[3]);
   2153       }
   2154 
   2155       fetch_texel(mach->Samplers[sampler_unit],
   2156                   &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
   2157                   control,
   2158                   &r[0], &r[1], &r[2], &r[3]);  /* outputs */
   2159       break;
   2160 
   2161    case TGSI_TEXTURE_2D_ARRAY:
   2162    case TGSI_TEXTURE_3D:
   2163    case TGSI_TEXTURE_CUBE:
   2164       FETCH(&r[0], 0, TGSI_CHAN_X);
   2165       FETCH(&r[1], 0, TGSI_CHAN_Y);
   2166       FETCH(&r[2], 0, TGSI_CHAN_Z);
   2167 
   2168       if (modifier == TEX_MODIFIER_PROJECTED) {
   2169          micro_div(&r[0], &r[0], &r[3]);
   2170          micro_div(&r[1], &r[1], &r[3]);
   2171          micro_div(&r[2], &r[2], &r[3]);
   2172       }
   2173 
   2174       fetch_texel(mach->Samplers[sampler_unit],
   2175                   &r[0], &r[1], &r[2], lod,
   2176                   control,
   2177                   &r[0], &r[1], &r[2], &r[3]);
   2178       break;
   2179 
   2180    case TGSI_TEXTURE_SHADOW2D_ARRAY:
   2181    case TGSI_TEXTURE_SHADOWCUBE:
   2182       FETCH(&r[0], 0, TGSI_CHAN_X);
   2183       FETCH(&r[1], 0, TGSI_CHAN_Y);
   2184       FETCH(&r[2], 0, TGSI_CHAN_Z);
   2185       FETCH(&r[3], 0, TGSI_CHAN_W);
   2186 
   2187       assert(modifier != TEX_MODIFIER_PROJECTED);
   2188 
   2189       fetch_texel(mach->Samplers[sampler_unit],
   2190                   &r[0], &r[1], &r[2], &r[3],
   2191                   control,
   2192                   &r[0], &r[1], &r[2], &r[3]);
   2193       break;
   2194 
   2195    default:
   2196       assert(0);
   2197    }
   2198 
   2199    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   2200       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   2201          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
   2202       }
   2203    }
   2204 }
   2205 
   2206 static void
   2207 exec_sample_d(struct tgsi_exec_machine *mach,
   2208               const struct tgsi_full_instruction *inst)
   2209 {
   2210    const uint resource_unit = inst->Src[1].Register.Index;
   2211    const uint sampler_unit = inst->Src[2].Register.Index;
   2212    union tgsi_exec_channel r[4];
   2213    uint chan;
   2214    /*
   2215     * XXX: This is fake SAMPLE_D -- the derivatives are not taken into account, yet.
   2216     */
   2217 
   2218    switch (mach->SamplerViews[resource_unit].Resource) {
   2219    case TGSI_TEXTURE_1D:
   2220    case TGSI_TEXTURE_SHADOW1D:
   2221 
   2222       FETCH(&r[0], 0, TGSI_CHAN_X);
   2223 
   2224       fetch_texel(mach->Samplers[sampler_unit],
   2225                   &r[0], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, BIAS */
   2226                   tgsi_sampler_lod_bias,
   2227                   &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
   2228       break;
   2229 
   2230    case TGSI_TEXTURE_2D:
   2231    case TGSI_TEXTURE_RECT:
   2232    case TGSI_TEXTURE_SHADOW2D:
   2233    case TGSI_TEXTURE_SHADOWRECT:
   2234 
   2235       FETCH(&r[0], 0, TGSI_CHAN_X);
   2236       FETCH(&r[1], 0, TGSI_CHAN_Y);
   2237       FETCH(&r[2], 0, TGSI_CHAN_Z);
   2238 
   2239       fetch_texel(mach->Samplers[sampler_unit],
   2240                   &r[0], &r[1], &r[2], &ZeroVec,   /* inputs */
   2241                   tgsi_sampler_lod_bias,
   2242                   &r[0], &r[1], &r[2], &r[3]);     /* outputs */
   2243       break;
   2244 
   2245    case TGSI_TEXTURE_3D:
   2246    case TGSI_TEXTURE_CUBE:
   2247 
   2248       FETCH(&r[0], 0, TGSI_CHAN_X);
   2249       FETCH(&r[1], 0, TGSI_CHAN_Y);
   2250       FETCH(&r[2], 0, TGSI_CHAN_Z);
   2251 
   2252       fetch_texel(mach->Samplers[sampler_unit],
   2253                   &r[0], &r[1], &r[2], &ZeroVec,
   2254                   tgsi_sampler_lod_bias,
   2255                   &r[0], &r[1], &r[2], &r[3]);
   2256       break;
   2257 
   2258    default:
   2259       assert(0);
   2260    }
   2261 
   2262    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   2263       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   2264          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
   2265       }
   2266    }
   2267 }
   2268 
   2269 
   2270 /**
   2271  * Evaluate a constant-valued coefficient at the position of the
   2272  * current quad.
   2273  */
   2274 static void
   2275 eval_constant_coef(
   2276    struct tgsi_exec_machine *mach,
   2277    unsigned attrib,
   2278    unsigned chan )
   2279 {
   2280    unsigned i;
   2281 
   2282    for( i = 0; i < TGSI_QUAD_SIZE; i++ ) {
   2283       mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
   2284    }
   2285 }
   2286 
   2287 /**
   2288  * Evaluate a linear-valued coefficient at the position of the
   2289  * current quad.
   2290  */
   2291 static void
   2292 eval_linear_coef(
   2293    struct tgsi_exec_machine *mach,
   2294    unsigned attrib,
   2295    unsigned chan )
   2296 {
   2297    const float x = mach->QuadPos.xyzw[0].f[0];
   2298    const float y = mach->QuadPos.xyzw[1].f[0];
   2299    const float dadx = mach->InterpCoefs[attrib].dadx[chan];
   2300    const float dady = mach->InterpCoefs[attrib].dady[chan];
   2301    const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
   2302    mach->Inputs[attrib].xyzw[chan].f[0] = a0;
   2303    mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
   2304    mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
   2305    mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
   2306 }
   2307 
   2308 /**
   2309  * Evaluate a perspective-valued coefficient at the position of the
   2310  * current quad.
   2311  */
   2312 static void
   2313 eval_perspective_coef(
   2314    struct tgsi_exec_machine *mach,
   2315    unsigned attrib,
   2316    unsigned chan )
   2317 {
   2318    const float x = mach->QuadPos.xyzw[0].f[0];
   2319    const float y = mach->QuadPos.xyzw[1].f[0];
   2320    const float dadx = mach->InterpCoefs[attrib].dadx[chan];
   2321    const float dady = mach->InterpCoefs[attrib].dady[chan];
   2322    const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
   2323    const float *w = mach->QuadPos.xyzw[3].f;
   2324    /* divide by W here */
   2325    mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
   2326    mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
   2327    mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
   2328    mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
   2329 }
   2330 
   2331 
   2332 typedef void (* eval_coef_func)(
   2333    struct tgsi_exec_machine *mach,
   2334    unsigned attrib,
   2335    unsigned chan );
   2336 
   2337 static void
   2338 exec_declaration(struct tgsi_exec_machine *mach,
   2339                  const struct tgsi_full_declaration *decl)
   2340 {
   2341    if (decl->Declaration.File == TGSI_FILE_SAMPLER_VIEW) {
   2342       mach->SamplerViews[decl->Range.First] = decl->SamplerView;
   2343       return;
   2344    }
   2345 
   2346    if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
   2347       if (decl->Declaration.File == TGSI_FILE_INPUT) {
   2348          uint first, last, mask;
   2349 
   2350          first = decl->Range.First;
   2351          last = decl->Range.Last;
   2352          mask = decl->Declaration.UsageMask;
   2353 
   2354          /* XXX we could remove this special-case code since
   2355           * mach->InterpCoefs[first].a0 should already have the
   2356           * front/back-face value.  But we should first update the
   2357           * ureg code to emit the right UsageMask value (WRITEMASK_X).
   2358           * Then, we could remove the tgsi_exec_machine::Face field.
   2359           */
   2360          /* XXX make FACE a system value */
   2361          if (decl->Semantic.Name == TGSI_SEMANTIC_FACE) {
   2362             uint i;
   2363 
   2364             assert(decl->Semantic.Index == 0);
   2365             assert(first == last);
   2366 
   2367             for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   2368                mach->Inputs[first].xyzw[0].f[i] = mach->Face;
   2369             }
   2370          } else {
   2371             eval_coef_func eval;
   2372             uint i, j;
   2373 
   2374             switch (decl->Interp.Interpolate) {
   2375             case TGSI_INTERPOLATE_CONSTANT:
   2376                eval = eval_constant_coef;
   2377                break;
   2378 
   2379             case TGSI_INTERPOLATE_LINEAR:
   2380                eval = eval_linear_coef;
   2381                break;
   2382 
   2383             case TGSI_INTERPOLATE_PERSPECTIVE:
   2384                eval = eval_perspective_coef;
   2385                break;
   2386 
   2387             case TGSI_INTERPOLATE_COLOR:
   2388                eval = mach->flatshade_color ? eval_constant_coef : eval_perspective_coef;
   2389                break;
   2390 
   2391             default:
   2392                assert(0);
   2393                return;
   2394             }
   2395 
   2396             for (j = 0; j < TGSI_NUM_CHANNELS; j++) {
   2397                if (mask & (1 << j)) {
   2398                   for (i = first; i <= last; i++) {
   2399                      eval(mach, i, j);
   2400                   }
   2401                }
   2402             }
   2403          }
   2404       }
   2405    }
   2406 
   2407    if (decl->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
   2408       mach->SysSemanticToIndex[decl->Declaration.Semantic] = decl->Range.First;
   2409    }
   2410 }
   2411 
   2412 
   2413 typedef void (* micro_op)(union tgsi_exec_channel *dst);
   2414 
   2415 static void
   2416 exec_vector(struct tgsi_exec_machine *mach,
   2417             const struct tgsi_full_instruction *inst,
   2418             micro_op op,
   2419             enum tgsi_exec_datatype dst_datatype)
   2420 {
   2421    unsigned int chan;
   2422 
   2423    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   2424       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   2425          union tgsi_exec_channel dst;
   2426 
   2427          op(&dst);
   2428          store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
   2429       }
   2430    }
   2431 }
   2432 
   2433 typedef void (* micro_unary_op)(union tgsi_exec_channel *dst,
   2434                                 const union tgsi_exec_channel *src);
   2435 
   2436 static void
   2437 exec_scalar_unary(struct tgsi_exec_machine *mach,
   2438                   const struct tgsi_full_instruction *inst,
   2439                   micro_unary_op op,
   2440                   enum tgsi_exec_datatype dst_datatype,
   2441                   enum tgsi_exec_datatype src_datatype)
   2442 {
   2443    unsigned int chan;
   2444    union tgsi_exec_channel src;
   2445    union tgsi_exec_channel dst;
   2446 
   2447    fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, src_datatype);
   2448    op(&dst, &src);
   2449    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   2450       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   2451          store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
   2452       }
   2453    }
   2454 }
   2455 
   2456 static void
   2457 exec_vector_unary(struct tgsi_exec_machine *mach,
   2458                   const struct tgsi_full_instruction *inst,
   2459                   micro_unary_op op,
   2460                   enum tgsi_exec_datatype dst_datatype,
   2461                   enum tgsi_exec_datatype src_datatype)
   2462 {
   2463    unsigned int chan;
   2464    struct tgsi_exec_vector dst;
   2465 
   2466    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   2467       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   2468          union tgsi_exec_channel src;
   2469 
   2470          fetch_source(mach, &src, &inst->Src[0], chan, src_datatype);
   2471          op(&dst.xyzw[chan], &src);
   2472       }
   2473    }
   2474    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   2475       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   2476          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
   2477       }
   2478    }
   2479 }
   2480 
   2481 typedef void (* micro_binary_op)(union tgsi_exec_channel *dst,
   2482                                  const union tgsi_exec_channel *src0,
   2483                                  const union tgsi_exec_channel *src1);
   2484 
   2485 static void
   2486 exec_scalar_binary(struct tgsi_exec_machine *mach,
   2487                    const struct tgsi_full_instruction *inst,
   2488                    micro_binary_op op,
   2489                    enum tgsi_exec_datatype dst_datatype,
   2490                    enum tgsi_exec_datatype src_datatype)
   2491 {
   2492    unsigned int chan;
   2493    union tgsi_exec_channel src[2];
   2494    union tgsi_exec_channel dst;
   2495 
   2496    fetch_source(mach, &src[0], &inst->Src[0], TGSI_CHAN_X, src_datatype);
   2497    fetch_source(mach, &src[1], &inst->Src[1], TGSI_CHAN_Y, src_datatype);
   2498    op(&dst, &src[0], &src[1]);
   2499    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   2500       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   2501          store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
   2502       }
   2503    }
   2504 }
   2505 
   2506 static void
   2507 exec_vector_binary(struct tgsi_exec_machine *mach,
   2508                    const struct tgsi_full_instruction *inst,
   2509                    micro_binary_op op,
   2510                    enum tgsi_exec_datatype dst_datatype,
   2511                    enum tgsi_exec_datatype src_datatype)
   2512 {
   2513    unsigned int chan;
   2514    struct tgsi_exec_vector dst;
   2515 
   2516    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   2517       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   2518          union tgsi_exec_channel src[2];
   2519 
   2520          fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
   2521          fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
   2522          op(&dst.xyzw[chan], &src[0], &src[1]);
   2523       }
   2524    }
   2525    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   2526       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   2527          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
   2528       }
   2529    }
   2530 }
   2531 
   2532 typedef void (* micro_trinary_op)(union tgsi_exec_channel *dst,
   2533                                   const union tgsi_exec_channel *src0,
   2534                                   const union tgsi_exec_channel *src1,
   2535                                   const union tgsi_exec_channel *src2);
   2536 
   2537 static void
   2538 exec_vector_trinary(struct tgsi_exec_machine *mach,
   2539                     const struct tgsi_full_instruction *inst,
   2540                     micro_trinary_op op,
   2541                     enum tgsi_exec_datatype dst_datatype,
   2542                     enum tgsi_exec_datatype src_datatype)
   2543 {
   2544    unsigned int chan;
   2545    struct tgsi_exec_vector dst;
   2546 
   2547    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   2548       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   2549          union tgsi_exec_channel src[3];
   2550 
   2551          fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
   2552          fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
   2553          fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
   2554          op(&dst.xyzw[chan], &src[0], &src[1], &src[2]);
   2555       }
   2556    }
   2557    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   2558       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   2559          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
   2560       }
   2561    }
   2562 }
   2563 
   2564 static void
   2565 exec_dp3(struct tgsi_exec_machine *mach,
   2566          const struct tgsi_full_instruction *inst)
   2567 {
   2568    unsigned int chan;
   2569    union tgsi_exec_channel arg[3];
   2570 
   2571    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   2572    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   2573    micro_mul(&arg[2], &arg[0], &arg[1]);
   2574 
   2575    for (chan = TGSI_CHAN_Y; chan <= TGSI_CHAN_Z; chan++) {
   2576       fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
   2577       fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
   2578       micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
   2579    }
   2580 
   2581    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   2582       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   2583          store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
   2584       }
   2585    }
   2586 }
   2587 
   2588 static void
   2589 exec_dp4(struct tgsi_exec_machine *mach,
   2590          const struct tgsi_full_instruction *inst)
   2591 {
   2592    unsigned int chan;
   2593    union tgsi_exec_channel arg[3];
   2594 
   2595    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   2596    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   2597    micro_mul(&arg[2], &arg[0], &arg[1]);
   2598 
   2599    for (chan = TGSI_CHAN_Y; chan <= TGSI_CHAN_W; chan++) {
   2600       fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
   2601       fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
   2602       micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
   2603    }
   2604 
   2605    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   2606       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   2607          store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
   2608       }
   2609    }
   2610 }
   2611 
   2612 static void
   2613 exec_dp2a(struct tgsi_exec_machine *mach,
   2614           const struct tgsi_full_instruction *inst)
   2615 {
   2616    unsigned int chan;
   2617    union tgsi_exec_channel arg[3];
   2618 
   2619    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   2620    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   2621    micro_mul(&arg[2], &arg[0], &arg[1]);
   2622 
   2623    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   2624    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   2625    micro_mad(&arg[0], &arg[0], &arg[1], &arg[2]);
   2626 
   2627    fetch_source(mach, &arg[1], &inst->Src[2], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   2628    micro_add(&arg[0], &arg[0], &arg[1]);
   2629 
   2630    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   2631       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   2632          store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
   2633       }
   2634    }
   2635 }
   2636 
   2637 static void
   2638 exec_dph(struct tgsi_exec_machine *mach,
   2639          const struct tgsi_full_instruction *inst)
   2640 {
   2641    unsigned int chan;
   2642    union tgsi_exec_channel arg[3];
   2643 
   2644    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   2645    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   2646    micro_mul(&arg[2], &arg[0], &arg[1]);
   2647 
   2648    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   2649    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   2650    micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
   2651 
   2652    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
   2653    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
   2654    micro_mad(&arg[0], &arg[0], &arg[1], &arg[2]);
   2655 
   2656    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
   2657    micro_add(&arg[0], &arg[0], &arg[1]);
   2658 
   2659    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   2660       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   2661          store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
   2662       }
   2663    }
   2664 }
   2665 
   2666 static void
   2667 exec_dp2(struct tgsi_exec_machine *mach,
   2668          const struct tgsi_full_instruction *inst)
   2669 {
   2670    unsigned int chan;
   2671    union tgsi_exec_channel arg[3];
   2672 
   2673    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   2674    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   2675    micro_mul(&arg[2], &arg[0], &arg[1]);
   2676 
   2677    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   2678    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   2679    micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
   2680 
   2681    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   2682       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   2683          store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
   2684       }
   2685    }
   2686 }
   2687 
   2688 static void
   2689 exec_nrm4(struct tgsi_exec_machine *mach,
   2690           const struct tgsi_full_instruction *inst)
   2691 {
   2692    unsigned int chan;
   2693    union tgsi_exec_channel arg[4];
   2694    union tgsi_exec_channel scale;
   2695 
   2696    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   2697    micro_mul(&scale, &arg[0], &arg[0]);
   2698 
   2699    for (chan = TGSI_CHAN_Y; chan <= TGSI_CHAN_W; chan++) {
   2700       union tgsi_exec_channel product;
   2701 
   2702       fetch_source(mach, &arg[chan], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
   2703       micro_mul(&product, &arg[chan], &arg[chan]);
   2704       micro_add(&scale, &scale, &product);
   2705    }
   2706 
   2707    micro_rsq(&scale, &scale);
   2708 
   2709    for (chan = TGSI_CHAN_X; chan <= TGSI_CHAN_W; chan++) {
   2710       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   2711          micro_mul(&arg[chan], &arg[chan], &scale);
   2712          store_dest(mach, &arg[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
   2713       }
   2714    }
   2715 }
   2716 
   2717 static void
   2718 exec_nrm3(struct tgsi_exec_machine *mach,
   2719           const struct tgsi_full_instruction *inst)
   2720 {
   2721    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XYZ) {
   2722       unsigned int chan;
   2723       union tgsi_exec_channel arg[3];
   2724       union tgsi_exec_channel scale;
   2725 
   2726       fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   2727       micro_mul(&scale, &arg[0], &arg[0]);
   2728 
   2729       for (chan = TGSI_CHAN_Y; chan <= TGSI_CHAN_Z; chan++) {
   2730          union tgsi_exec_channel product;
   2731 
   2732          fetch_source(mach, &arg[chan], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
   2733          micro_mul(&product, &arg[chan], &arg[chan]);
   2734          micro_add(&scale, &scale, &product);
   2735       }
   2736 
   2737       micro_rsq(&scale, &scale);
   2738 
   2739       for (chan = TGSI_CHAN_X; chan <= TGSI_CHAN_Z; chan++) {
   2740          if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   2741             micro_mul(&arg[chan], &arg[chan], &scale);
   2742             store_dest(mach, &arg[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
   2743          }
   2744       }
   2745    }
   2746 
   2747    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
   2748       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
   2749    }
   2750 }
   2751 
   2752 static void
   2753 exec_scs(struct tgsi_exec_machine *mach,
   2754          const struct tgsi_full_instruction *inst)
   2755 {
   2756    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) {
   2757       union tgsi_exec_channel arg;
   2758       union tgsi_exec_channel result;
   2759 
   2760       fetch_source(mach, &arg, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   2761 
   2762       if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
   2763          micro_cos(&result, &arg);
   2764          store_dest(mach, &result, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   2765       }
   2766       if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
   2767          micro_sin(&result, &arg);
   2768          store_dest(mach, &result, &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   2769       }
   2770    }
   2771    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
   2772       store_dest(mach, &ZeroVec, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
   2773    }
   2774    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
   2775       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
   2776    }
   2777 }
   2778 
   2779 static void
   2780 exec_x2d(struct tgsi_exec_machine *mach,
   2781          const struct tgsi_full_instruction *inst)
   2782 {
   2783    union tgsi_exec_channel r[4];
   2784    union tgsi_exec_channel d[2];
   2785 
   2786    fetch_source(mach, &r[0], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   2787    fetch_source(mach, &r[1], &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   2788    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XZ) {
   2789       fetch_source(mach, &r[2], &inst->Src[2], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   2790       micro_mul(&r[2], &r[2], &r[0]);
   2791       fetch_source(mach, &r[3], &inst->Src[2], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   2792       micro_mul(&r[3], &r[3], &r[1]);
   2793       micro_add(&r[2], &r[2], &r[3]);
   2794       fetch_source(mach, &r[3], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   2795       micro_add(&d[0], &r[2], &r[3]);
   2796    }
   2797    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_YW) {
   2798       fetch_source(mach, &r[2], &inst->Src[2], TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
   2799       micro_mul(&r[2], &r[2], &r[0]);
   2800       fetch_source(mach, &r[3], &inst->Src[2], TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
   2801       micro_mul(&r[3], &r[3], &r[1]);
   2802       micro_add(&r[2], &r[2], &r[3]);
   2803       fetch_source(mach, &r[3], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   2804       micro_add(&d[1], &r[2], &r[3]);
   2805    }
   2806    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
   2807       store_dest(mach, &d[0], &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   2808    }
   2809    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
   2810       store_dest(mach, &d[1], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   2811    }
   2812    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
   2813       store_dest(mach, &d[0], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
   2814    }
   2815    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
   2816       store_dest(mach, &d[1], &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
   2817    }
   2818 }
   2819 
   2820 static void
   2821 exec_rfl(struct tgsi_exec_machine *mach,
   2822          const struct tgsi_full_instruction *inst)
   2823 {
   2824    union tgsi_exec_channel r[9];
   2825 
   2826    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XYZ) {
   2827       /* r0 = dp3(src0, src0) */
   2828       fetch_source(mach, &r[2], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   2829       micro_mul(&r[0], &r[2], &r[2]);
   2830       fetch_source(mach, &r[4], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   2831       micro_mul(&r[8], &r[4], &r[4]);
   2832       micro_add(&r[0], &r[0], &r[8]);
   2833       fetch_source(mach, &r[6], &inst->Src[0], TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
   2834       micro_mul(&r[8], &r[6], &r[6]);
   2835       micro_add(&r[0], &r[0], &r[8]);
   2836 
   2837       /* r1 = dp3(src0, src1) */
   2838       fetch_source(mach, &r[3], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   2839       micro_mul(&r[1], &r[2], &r[3]);
   2840       fetch_source(mach, &r[5], &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   2841       micro_mul(&r[8], &r[4], &r[5]);
   2842       micro_add(&r[1], &r[1], &r[8]);
   2843       fetch_source(mach, &r[7], &inst->Src[1], TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
   2844       micro_mul(&r[8], &r[6], &r[7]);
   2845       micro_add(&r[1], &r[1], &r[8]);
   2846 
   2847       /* r1 = 2 * r1 / r0 */
   2848       micro_add(&r[1], &r[1], &r[1]);
   2849       micro_div(&r[1], &r[1], &r[0]);
   2850 
   2851       if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
   2852          micro_mul(&r[2], &r[2], &r[1]);
   2853          micro_sub(&r[2], &r[2], &r[3]);
   2854          store_dest(mach, &r[2], &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   2855       }
   2856       if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
   2857          micro_mul(&r[4], &r[4], &r[1]);
   2858          micro_sub(&r[4], &r[4], &r[5]);
   2859          store_dest(mach, &r[4], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   2860       }
   2861       if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
   2862          micro_mul(&r[6], &r[6], &r[1]);
   2863          micro_sub(&r[6], &r[6], &r[7]);
   2864          store_dest(mach, &r[6], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
   2865       }
   2866    }
   2867    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
   2868       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
   2869    }
   2870 }
   2871 
   2872 static void
   2873 exec_xpd(struct tgsi_exec_machine *mach,
   2874          const struct tgsi_full_instruction *inst)
   2875 {
   2876    union tgsi_exec_channel r[6];
   2877    union tgsi_exec_channel d[3];
   2878 
   2879    fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   2880    fetch_source(mach, &r[1], &inst->Src[1], TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
   2881 
   2882    micro_mul(&r[2], &r[0], &r[1]);
   2883 
   2884    fetch_source(mach, &r[3], &inst->Src[0], TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
   2885    fetch_source(mach, &r[4], &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   2886 
   2887    micro_mul(&r[5], &r[3], &r[4] );
   2888    micro_sub(&d[TGSI_CHAN_X], &r[2], &r[5]);
   2889 
   2890    fetch_source(mach, &r[2], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   2891 
   2892    micro_mul(&r[3], &r[3], &r[2]);
   2893 
   2894    fetch_source(mach, &r[5], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   2895 
   2896    micro_mul(&r[1], &r[1], &r[5]);
   2897    micro_sub(&d[TGSI_CHAN_Y], &r[3], &r[1]);
   2898 
   2899    micro_mul(&r[5], &r[5], &r[4]);
   2900    micro_mul(&r[0], &r[0], &r[2]);
   2901    micro_sub(&d[TGSI_CHAN_Z], &r[5], &r[0]);
   2902 
   2903    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
   2904       store_dest(mach, &d[TGSI_CHAN_X], &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   2905    }
   2906    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
   2907       store_dest(mach, &d[TGSI_CHAN_Y], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   2908    }
   2909    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
   2910       store_dest(mach, &d[TGSI_CHAN_Z], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
   2911    }
   2912    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
   2913       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
   2914    }
   2915 }
   2916 
   2917 static void
   2918 exec_dst(struct tgsi_exec_machine *mach,
   2919          const struct tgsi_full_instruction *inst)
   2920 {
   2921    union tgsi_exec_channel r[2];
   2922    union tgsi_exec_channel d[4];
   2923 
   2924    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
   2925       fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   2926       fetch_source(mach, &r[1], &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   2927       micro_mul(&d[TGSI_CHAN_Y], &r[0], &r[1]);
   2928    }
   2929    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
   2930       fetch_source(mach, &d[TGSI_CHAN_Z], &inst->Src[0], TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
   2931    }
   2932    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
   2933       fetch_source(mach, &d[TGSI_CHAN_W], &inst->Src[1], TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
   2934    }
   2935 
   2936    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
   2937       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   2938    }
   2939    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
   2940       store_dest(mach, &d[TGSI_CHAN_Y], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   2941    }
   2942    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
   2943       store_dest(mach, &d[TGSI_CHAN_Z], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
   2944    }
   2945    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
   2946       store_dest(mach, &d[TGSI_CHAN_W], &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
   2947    }
   2948 }
   2949 
   2950 static void
   2951 exec_log(struct tgsi_exec_machine *mach,
   2952          const struct tgsi_full_instruction *inst)
   2953 {
   2954    union tgsi_exec_channel r[3];
   2955 
   2956    fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   2957    micro_abs(&r[2], &r[0]);  /* r2 = abs(r0) */
   2958    micro_lg2(&r[1], &r[2]);  /* r1 = lg2(r2) */
   2959    micro_flr(&r[0], &r[1]);  /* r0 = floor(r1) */
   2960    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
   2961       store_dest(mach, &r[0], &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   2962    }
   2963    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
   2964       micro_exp2(&r[0], &r[0]);       /* r0 = 2 ^ r0 */
   2965       micro_div(&r[0], &r[2], &r[0]); /* r0 = r2 / r0 */
   2966       store_dest(mach, &r[0], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   2967    }
   2968    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
   2969       store_dest(mach, &r[1], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
   2970    }
   2971    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
   2972       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
   2973    }
   2974 }
   2975 
   2976 static void
   2977 exec_exp(struct tgsi_exec_machine *mach,
   2978          const struct tgsi_full_instruction *inst)
   2979 {
   2980    union tgsi_exec_channel r[3];
   2981 
   2982    fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   2983    micro_flr(&r[1], &r[0]);  /* r1 = floor(r0) */
   2984    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
   2985       micro_exp2(&r[2], &r[1]);       /* r2 = 2 ^ r1 */
   2986       store_dest(mach, &r[2], &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   2987    }
   2988    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
   2989       micro_sub(&r[2], &r[0], &r[1]); /* r2 = r0 - r1 */
   2990       store_dest(mach, &r[2], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   2991    }
   2992    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
   2993       micro_exp2(&r[2], &r[0]);       /* r2 = 2 ^ r0 */
   2994       store_dest(mach, &r[2], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
   2995    }
   2996    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
   2997       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
   2998    }
   2999 }
   3000 
   3001 static void
   3002 exec_lit(struct tgsi_exec_machine *mach,
   3003          const struct tgsi_full_instruction *inst)
   3004 {
   3005    union tgsi_exec_channel r[3];
   3006    union tgsi_exec_channel d[3];
   3007 
   3008    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_YZ) {
   3009       fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   3010       if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
   3011          fetch_source(mach, &r[1], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   3012          micro_max(&r[1], &r[1], &ZeroVec);
   3013 
   3014          fetch_source(mach, &r[2], &inst->Src[0], TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
   3015          micro_min(&r[2], &r[2], &P128Vec);
   3016          micro_max(&r[2], &r[2], &M128Vec);
   3017          micro_pow(&r[1], &r[1], &r[2]);
   3018          micro_lt(&d[TGSI_CHAN_Z], &ZeroVec, &r[0], &r[1], &ZeroVec);
   3019          store_dest(mach, &d[TGSI_CHAN_Z], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
   3020       }
   3021       if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
   3022          micro_max(&d[TGSI_CHAN_Y], &r[0], &ZeroVec);
   3023          store_dest(mach, &d[TGSI_CHAN_Y], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   3024       }
   3025    }
   3026    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
   3027       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   3028    }
   3029 
   3030    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
   3031       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
   3032    }
   3033 }
   3034 
   3035 static void
   3036 exec_break(struct tgsi_exec_machine *mach)
   3037 {
   3038    if (mach->BreakType == TGSI_EXEC_BREAK_INSIDE_LOOP) {
   3039       /* turn off loop channels for each enabled exec channel */
   3040       mach->LoopMask &= ~mach->ExecMask;
   3041       /* Todo: if mach->LoopMask == 0, jump to end of loop */
   3042       UPDATE_EXEC_MASK(mach);
   3043    } else {
   3044       assert(mach->BreakType == TGSI_EXEC_BREAK_INSIDE_SWITCH);
   3045 
   3046       mach->Switch.mask = 0x0;
   3047 
   3048       UPDATE_EXEC_MASK(mach);
   3049    }
   3050 }
   3051 
   3052 static void
   3053 exec_switch(struct tgsi_exec_machine *mach,
   3054             const struct tgsi_full_instruction *inst)
   3055 {
   3056    assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
   3057    assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
   3058 
   3059    mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
   3060    fetch_source(mach, &mach->Switch.selector, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_UINT);
   3061    mach->Switch.mask = 0x0;
   3062    mach->Switch.defaultMask = 0x0;
   3063 
   3064    mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
   3065    mach->BreakType = TGSI_EXEC_BREAK_INSIDE_SWITCH;
   3066 
   3067    UPDATE_EXEC_MASK(mach);
   3068 }
   3069 
   3070 static void
   3071 exec_case(struct tgsi_exec_machine *mach,
   3072           const struct tgsi_full_instruction *inst)
   3073 {
   3074    uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
   3075    union tgsi_exec_channel src;
   3076    uint mask = 0;
   3077 
   3078    fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_UINT);
   3079 
   3080    if (mach->Switch.selector.u[0] == src.u[0]) {
   3081       mask |= 0x1;
   3082    }
   3083    if (mach->Switch.selector.u[1] == src.u[1]) {
   3084       mask |= 0x2;
   3085    }
   3086    if (mach->Switch.selector.u[2] == src.u[2]) {
   3087       mask |= 0x4;
   3088    }
   3089    if (mach->Switch.selector.u[3] == src.u[3]) {
   3090       mask |= 0x8;
   3091    }
   3092 
   3093    mach->Switch.defaultMask |= mask;
   3094 
   3095    mach->Switch.mask |= mask & prevMask;
   3096 
   3097    UPDATE_EXEC_MASK(mach);
   3098 }
   3099 
   3100 static void
   3101 exec_default(struct tgsi_exec_machine *mach)
   3102 {
   3103    uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
   3104 
   3105    mach->Switch.mask |= ~mach->Switch.defaultMask & prevMask;
   3106 
   3107    UPDATE_EXEC_MASK(mach);
   3108 }
   3109 
   3110 static void
   3111 exec_endswitch(struct tgsi_exec_machine *mach)
   3112 {
   3113    mach->Switch = mach->SwitchStack[--mach->SwitchStackTop];
   3114    mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
   3115 
   3116    UPDATE_EXEC_MASK(mach);
   3117 }
   3118 
   3119 static void
   3120 micro_i2f(union tgsi_exec_channel *dst,
   3121           const union tgsi_exec_channel *src)
   3122 {
   3123    dst->f[0] = (float)src->i[0];
   3124    dst->f[1] = (float)src->i[1];
   3125    dst->f[2] = (float)src->i[2];
   3126    dst->f[3] = (float)src->i[3];
   3127 }
   3128 
   3129 static void
   3130 micro_not(union tgsi_exec_channel *dst,
   3131           const union tgsi_exec_channel *src)
   3132 {
   3133    dst->u[0] = ~src->u[0];
   3134    dst->u[1] = ~src->u[1];
   3135    dst->u[2] = ~src->u[2];
   3136    dst->u[3] = ~src->u[3];
   3137 }
   3138 
   3139 static void
   3140 micro_shl(union tgsi_exec_channel *dst,
   3141           const union tgsi_exec_channel *src0,
   3142           const union tgsi_exec_channel *src1)
   3143 {
   3144    dst->u[0] = src0->u[0] << src1->u[0];
   3145    dst->u[1] = src0->u[1] << src1->u[1];
   3146    dst->u[2] = src0->u[2] << src1->u[2];
   3147    dst->u[3] = src0->u[3] << src1->u[3];
   3148 }
   3149 
   3150 static void
   3151 micro_and(union tgsi_exec_channel *dst,
   3152           const union tgsi_exec_channel *src0,
   3153           const union tgsi_exec_channel *src1)
   3154 {
   3155    dst->u[0] = src0->u[0] & src1->u[0];
   3156    dst->u[1] = src0->u[1] & src1->u[1];
   3157    dst->u[2] = src0->u[2] & src1->u[2];
   3158    dst->u[3] = src0->u[3] & src1->u[3];
   3159 }
   3160 
   3161 static void
   3162 micro_or(union tgsi_exec_channel *dst,
   3163          const union tgsi_exec_channel *src0,
   3164          const union tgsi_exec_channel *src1)
   3165 {
   3166    dst->u[0] = src0->u[0] | src1->u[0];
   3167    dst->u[1] = src0->u[1] | src1->u[1];
   3168    dst->u[2] = src0->u[2] | src1->u[2];
   3169    dst->u[3] = src0->u[3] | src1->u[3];
   3170 }
   3171 
   3172 static void
   3173 micro_xor(union tgsi_exec_channel *dst,
   3174           const union tgsi_exec_channel *src0,
   3175           const union tgsi_exec_channel *src1)
   3176 {
   3177    dst->u[0] = src0->u[0] ^ src1->u[0];
   3178    dst->u[1] = src0->u[1] ^ src1->u[1];
   3179    dst->u[2] = src0->u[2] ^ src1->u[2];
   3180    dst->u[3] = src0->u[3] ^ src1->u[3];
   3181 }
   3182 
   3183 static void
   3184 micro_mod(union tgsi_exec_channel *dst,
   3185           const union tgsi_exec_channel *src0,
   3186           const union tgsi_exec_channel *src1)
   3187 {
   3188    dst->i[0] = src0->i[0] % src1->i[0];
   3189    dst->i[1] = src0->i[1] % src1->i[1];
   3190    dst->i[2] = src0->i[2] % src1->i[2];
   3191    dst->i[3] = src0->i[3] % src1->i[3];
   3192 }
   3193 
   3194 static void
   3195 micro_f2i(union tgsi_exec_channel *dst,
   3196           const union tgsi_exec_channel *src)
   3197 {
   3198    dst->i[0] = (int)src->f[0];
   3199    dst->i[1] = (int)src->f[1];
   3200    dst->i[2] = (int)src->f[2];
   3201    dst->i[3] = (int)src->f[3];
   3202 }
   3203 
   3204 static void
   3205 micro_idiv(union tgsi_exec_channel *dst,
   3206            const union tgsi_exec_channel *src0,
   3207            const union tgsi_exec_channel *src1)
   3208 {
   3209    dst->i[0] = src0->i[0] / src1->i[0];
   3210    dst->i[1] = src0->i[1] / src1->i[1];
   3211    dst->i[2] = src0->i[2] / src1->i[2];
   3212    dst->i[3] = src0->i[3] / src1->i[3];
   3213 }
   3214 
   3215 static void
   3216 micro_imax(union tgsi_exec_channel *dst,
   3217            const union tgsi_exec_channel *src0,
   3218            const union tgsi_exec_channel *src1)
   3219 {
   3220    dst->i[0] = src0->i[0] > src1->i[0] ? src0->i[0] : src1->i[0];
   3221    dst->i[1] = src0->i[1] > src1->i[1] ? src0->i[1] : src1->i[1];
   3222    dst->i[2] = src0->i[2] > src1->i[2] ? src0->i[2] : src1->i[2];
   3223    dst->i[3] = src0->i[3] > src1->i[3] ? src0->i[3] : src1->i[3];
   3224 }
   3225 
   3226 static void
   3227 micro_imin(union tgsi_exec_channel *dst,
   3228            const union tgsi_exec_channel *src0,
   3229            const union tgsi_exec_channel *src1)
   3230 {
   3231    dst->i[0] = src0->i[0] < src1->i[0] ? src0->i[0] : src1->i[0];
   3232    dst->i[1] = src0->i[1] < src1->i[1] ? src0->i[1] : src1->i[1];
   3233    dst->i[2] = src0->i[2] < src1->i[2] ? src0->i[2] : src1->i[2];
   3234    dst->i[3] = src0->i[3] < src1->i[3] ? src0->i[3] : src1->i[3];
   3235 }
   3236 
   3237 static void
   3238 micro_isge(union tgsi_exec_channel *dst,
   3239            const union tgsi_exec_channel *src0,
   3240            const union tgsi_exec_channel *src1)
   3241 {
   3242    dst->i[0] = src0->i[0] >= src1->i[0] ? -1 : 0;
   3243    dst->i[1] = src0->i[1] >= src1->i[1] ? -1 : 0;
   3244    dst->i[2] = src0->i[2] >= src1->i[2] ? -1 : 0;
   3245    dst->i[3] = src0->i[3] >= src1->i[3] ? -1 : 0;
   3246 }
   3247 
   3248 static void
   3249 micro_ishr(union tgsi_exec_channel *dst,
   3250            const union tgsi_exec_channel *src0,
   3251            const union tgsi_exec_channel *src1)
   3252 {
   3253    dst->i[0] = src0->i[0] >> src1->i[0];
   3254    dst->i[1] = src0->i[1] >> src1->i[1];
   3255    dst->i[2] = src0->i[2] >> src1->i[2];
   3256    dst->i[3] = src0->i[3] >> src1->i[3];
   3257 }
   3258 
   3259 static void
   3260 micro_islt(union tgsi_exec_channel *dst,
   3261            const union tgsi_exec_channel *src0,
   3262            const union tgsi_exec_channel *src1)
   3263 {
   3264    dst->i[0] = src0->i[0] < src1->i[0] ? -1 : 0;
   3265    dst->i[1] = src0->i[1] < src1->i[1] ? -1 : 0;
   3266    dst->i[2] = src0->i[2] < src1->i[2] ? -1 : 0;
   3267    dst->i[3] = src0->i[3] < src1->i[3] ? -1 : 0;
   3268 }
   3269 
   3270 static void
   3271 micro_f2u(union tgsi_exec_channel *dst,
   3272           const union tgsi_exec_channel *src)
   3273 {
   3274    dst->u[0] = (uint)src->f[0];
   3275    dst->u[1] = (uint)src->f[1];
   3276    dst->u[2] = (uint)src->f[2];
   3277    dst->u[3] = (uint)src->f[3];
   3278 }
   3279 
   3280 static void
   3281 micro_u2f(union tgsi_exec_channel *dst,
   3282           const union tgsi_exec_channel *src)
   3283 {
   3284    dst->f[0] = (float)src->u[0];
   3285    dst->f[1] = (float)src->u[1];
   3286    dst->f[2] = (float)src->u[2];
   3287    dst->f[3] = (float)src->u[3];
   3288 }
   3289 
   3290 static void
   3291 micro_uadd(union tgsi_exec_channel *dst,
   3292            const union tgsi_exec_channel *src0,
   3293            const union tgsi_exec_channel *src1)
   3294 {
   3295    dst->u[0] = src0->u[0] + src1->u[0];
   3296    dst->u[1] = src0->u[1] + src1->u[1];
   3297    dst->u[2] = src0->u[2] + src1->u[2];
   3298    dst->u[3] = src0->u[3] + src1->u[3];
   3299 }
   3300 
   3301 static void
   3302 micro_udiv(union tgsi_exec_channel *dst,
   3303            const union tgsi_exec_channel *src0,
   3304            const union tgsi_exec_channel *src1)
   3305 {
   3306    dst->u[0] = src0->u[0] / src1->u[0];
   3307    dst->u[1] = src0->u[1] / src1->u[1];
   3308    dst->u[2] = src0->u[2] / src1->u[2];
   3309    dst->u[3] = src0->u[3] / src1->u[3];
   3310 }
   3311 
   3312 static void
   3313 micro_umad(union tgsi_exec_channel *dst,
   3314            const union tgsi_exec_channel *src0,
   3315            const union tgsi_exec_channel *src1,
   3316            const union tgsi_exec_channel *src2)
   3317 {
   3318    dst->u[0] = src0->u[0] * src1->u[0] + src2->u[0];
   3319    dst->u[1] = src0->u[1] * src1->u[1] + src2->u[1];
   3320    dst->u[2] = src0->u[2] * src1->u[2] + src2->u[2];
   3321    dst->u[3] = src0->u[3] * src1->u[3] + src2->u[3];
   3322 }
   3323 
   3324 static void
   3325 micro_umax(union tgsi_exec_channel *dst,
   3326            const union tgsi_exec_channel *src0,
   3327            const union tgsi_exec_channel *src1)
   3328 {
   3329    dst->u[0] = src0->u[0] > src1->u[0] ? src0->u[0] : src1->u[0];
   3330    dst->u[1] = src0->u[1] > src1->u[1] ? src0->u[1] : src1->u[1];
   3331    dst->u[2] = src0->u[2] > src1->u[2] ? src0->u[2] : src1->u[2];
   3332    dst->u[3] = src0->u[3] > src1->u[3] ? src0->u[3] : src1->u[3];
   3333 }
   3334 
   3335 static void
   3336 micro_umin(union tgsi_exec_channel *dst,
   3337            const union tgsi_exec_channel *src0,
   3338            const union tgsi_exec_channel *src1)
   3339 {
   3340    dst->u[0] = src0->u[0] < src1->u[0] ? src0->u[0] : src1->u[0];
   3341    dst->u[1] = src0->u[1] < src1->u[1] ? src0->u[1] : src1->u[1];
   3342    dst->u[2] = src0->u[2] < src1->u[2] ? src0->u[2] : src1->u[2];
   3343    dst->u[3] = src0->u[3] < src1->u[3] ? src0->u[3] : src1->u[3];
   3344 }
   3345 
   3346 static void
   3347 micro_umod(union tgsi_exec_channel *dst,
   3348            const union tgsi_exec_channel *src0,
   3349            const union tgsi_exec_channel *src1)
   3350 {
   3351    dst->u[0] = src0->u[0] % src1->u[0];
   3352    dst->u[1] = src0->u[1] % src1->u[1];
   3353    dst->u[2] = src0->u[2] % src1->u[2];
   3354    dst->u[3] = src0->u[3] % src1->u[3];
   3355 }
   3356 
   3357 static void
   3358 micro_umul(union tgsi_exec_channel *dst,
   3359            const union tgsi_exec_channel *src0,
   3360            const union tgsi_exec_channel *src1)
   3361 {
   3362    dst->u[0] = src0->u[0] * src1->u[0];
   3363    dst->u[1] = src0->u[1] * src1->u[1];
   3364    dst->u[2] = src0->u[2] * src1->u[2];
   3365    dst->u[3] = src0->u[3] * src1->u[3];
   3366 }
   3367 
   3368 static void
   3369 micro_useq(union tgsi_exec_channel *dst,
   3370            const union tgsi_exec_channel *src0,
   3371            const union tgsi_exec_channel *src1)
   3372 {
   3373    dst->u[0] = src0->u[0] == src1->u[0] ? ~0 : 0;
   3374    dst->u[1] = src0->u[1] == src1->u[1] ? ~0 : 0;
   3375    dst->u[2] = src0->u[2] == src1->u[2] ? ~0 : 0;
   3376    dst->u[3] = src0->u[3] == src1->u[3] ? ~0 : 0;
   3377 }
   3378 
   3379 static void
   3380 micro_usge(union tgsi_exec_channel *dst,
   3381            const union tgsi_exec_channel *src0,
   3382            const union tgsi_exec_channel *src1)
   3383 {
   3384    dst->u[0] = src0->u[0] >= src1->u[0] ? ~0 : 0;
   3385    dst->u[1] = src0->u[1] >= src1->u[1] ? ~0 : 0;
   3386    dst->u[2] = src0->u[2] >= src1->u[2] ? ~0 : 0;
   3387    dst->u[3] = src0->u[3] >= src1->u[3] ? ~0 : 0;
   3388 }
   3389 
   3390 static void
   3391 micro_ushr(union tgsi_exec_channel *dst,
   3392            const union tgsi_exec_channel *src0,
   3393            const union tgsi_exec_channel *src1)
   3394 {
   3395    dst->u[0] = src0->u[0] >> src1->u[0];
   3396    dst->u[1] = src0->u[1] >> src1->u[1];
   3397    dst->u[2] = src0->u[2] >> src1->u[2];
   3398    dst->u[3] = src0->u[3] >> src1->u[3];
   3399 }
   3400 
   3401 static void
   3402 micro_uslt(union tgsi_exec_channel *dst,
   3403            const union tgsi_exec_channel *src0,
   3404            const union tgsi_exec_channel *src1)
   3405 {
   3406    dst->u[0] = src0->u[0] < src1->u[0] ? ~0 : 0;
   3407    dst->u[1] = src0->u[1] < src1->u[1] ? ~0 : 0;
   3408    dst->u[2] = src0->u[2] < src1->u[2] ? ~0 : 0;
   3409    dst->u[3] = src0->u[3] < src1->u[3] ? ~0 : 0;
   3410 }
   3411 
   3412 static void
   3413 micro_usne(union tgsi_exec_channel *dst,
   3414            const union tgsi_exec_channel *src0,
   3415            const union tgsi_exec_channel *src1)
   3416 {
   3417    dst->u[0] = src0->u[0] != src1->u[0] ? ~0 : 0;
   3418    dst->u[1] = src0->u[1] != src1->u[1] ? ~0 : 0;
   3419    dst->u[2] = src0->u[2] != src1->u[2] ? ~0 : 0;
   3420    dst->u[3] = src0->u[3] != src1->u[3] ? ~0 : 0;
   3421 }
   3422 
   3423 static void
   3424 micro_uarl(union tgsi_exec_channel *dst,
   3425            const union tgsi_exec_channel *src)
   3426 {
   3427    dst->i[0] = src->u[0];
   3428    dst->i[1] = src->u[1];
   3429    dst->i[2] = src->u[2];
   3430    dst->i[3] = src->u[3];
   3431 }
   3432 
   3433 static void
   3434 micro_ucmp(union tgsi_exec_channel *dst,
   3435            const union tgsi_exec_channel *src0,
   3436            const union tgsi_exec_channel *src1,
   3437            const union tgsi_exec_channel *src2)
   3438 {
   3439    dst->u[0] = src0->u[0] ? src1->u[0] : src2->u[0];
   3440    dst->u[1] = src0->u[1] ? src1->u[1] : src2->u[1];
   3441    dst->u[2] = src0->u[2] ? src1->u[2] : src2->u[2];
   3442    dst->u[3] = src0->u[3] ? src1->u[3] : src2->u[3];
   3443 }
   3444 
   3445 static void
   3446 exec_instruction(
   3447    struct tgsi_exec_machine *mach,
   3448    const struct tgsi_full_instruction *inst,
   3449    int *pc )
   3450 {
   3451    union tgsi_exec_channel r[10];
   3452 
   3453    (*pc)++;
   3454 
   3455    switch (inst->Instruction.Opcode) {
   3456    case TGSI_OPCODE_ARL:
   3457       exec_vector_unary(mach, inst, micro_arl, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
   3458       break;
   3459 
   3460    case TGSI_OPCODE_MOV:
   3461       exec_vector_unary(mach, inst, micro_mov, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
   3462       break;
   3463 
   3464    case TGSI_OPCODE_LIT:
   3465       exec_lit(mach, inst);
   3466       break;
   3467 
   3468    case TGSI_OPCODE_RCP:
   3469       exec_scalar_unary(mach, inst, micro_rcp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   3470       break;
   3471 
   3472    case TGSI_OPCODE_RSQ:
   3473       exec_scalar_unary(mach, inst, micro_rsq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   3474       break;
   3475 
   3476    case TGSI_OPCODE_EXP:
   3477       exec_exp(mach, inst);
   3478       break;
   3479 
   3480    case TGSI_OPCODE_LOG:
   3481       exec_log(mach, inst);
   3482       break;
   3483 
   3484    case TGSI_OPCODE_MUL:
   3485       exec_vector_binary(mach, inst, micro_mul, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   3486       break;
   3487 
   3488    case TGSI_OPCODE_ADD:
   3489       exec_vector_binary(mach, inst, micro_add, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   3490       break;
   3491 
   3492    case TGSI_OPCODE_DP3:
   3493       exec_dp3(mach, inst);
   3494       break;
   3495 
   3496    case TGSI_OPCODE_DP4:
   3497       exec_dp4(mach, inst);
   3498       break;
   3499 
   3500    case TGSI_OPCODE_DST:
   3501       exec_dst(mach, inst);
   3502       break;
   3503 
   3504    case TGSI_OPCODE_MIN:
   3505       exec_vector_binary(mach, inst, micro_min, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   3506       break;
   3507 
   3508    case TGSI_OPCODE_MAX:
   3509       exec_vector_binary(mach, inst, micro_max, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   3510       break;
   3511 
   3512    case TGSI_OPCODE_SLT:
   3513       exec_vector_binary(mach, inst, micro_slt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   3514       break;
   3515 
   3516    case TGSI_OPCODE_SGE:
   3517       exec_vector_binary(mach, inst, micro_sge, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   3518       break;
   3519 
   3520    case TGSI_OPCODE_MAD:
   3521       exec_vector_trinary(mach, inst, micro_mad, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   3522       break;
   3523 
   3524    case TGSI_OPCODE_SUB:
   3525       exec_vector_binary(mach, inst, micro_sub, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   3526       break;
   3527 
   3528    case TGSI_OPCODE_LRP:
   3529       exec_vector_trinary(mach, inst, micro_lrp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   3530       break;
   3531 
   3532    case TGSI_OPCODE_CND:
   3533       exec_vector_trinary(mach, inst, micro_cnd, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   3534       break;
   3535 
   3536    case TGSI_OPCODE_DP2A:
   3537       exec_dp2a(mach, inst);
   3538       break;
   3539 
   3540    case TGSI_OPCODE_FRC:
   3541       exec_vector_unary(mach, inst, micro_frc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   3542       break;
   3543 
   3544    case TGSI_OPCODE_CLAMP:
   3545       exec_vector_trinary(mach, inst, micro_clamp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   3546       break;
   3547 
   3548    case TGSI_OPCODE_FLR:
   3549       exec_vector_unary(mach, inst, micro_flr, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   3550       break;
   3551 
   3552    case TGSI_OPCODE_ROUND:
   3553       exec_vector_unary(mach, inst, micro_rnd, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   3554       break;
   3555 
   3556    case TGSI_OPCODE_EX2:
   3557       exec_scalar_unary(mach, inst, micro_exp2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   3558       break;
   3559 
   3560    case TGSI_OPCODE_LG2:
   3561       exec_scalar_unary(mach, inst, micro_lg2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   3562       break;
   3563 
   3564    case TGSI_OPCODE_POW:
   3565       exec_scalar_binary(mach, inst, micro_pow, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   3566       break;
   3567 
   3568    case TGSI_OPCODE_XPD:
   3569       exec_xpd(mach, inst);
   3570       break;
   3571 
   3572    case TGSI_OPCODE_ABS:
   3573       exec_vector_unary(mach, inst, micro_abs, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   3574       break;
   3575 
   3576    case TGSI_OPCODE_RCC:
   3577       exec_scalar_unary(mach, inst, micro_rcc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   3578       break;
   3579 
   3580    case TGSI_OPCODE_DPH:
   3581       exec_dph(mach, inst);
   3582       break;
   3583 
   3584    case TGSI_OPCODE_COS:
   3585       exec_scalar_unary(mach, inst, micro_cos, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   3586       break;
   3587 
   3588    case TGSI_OPCODE_DDX:
   3589       exec_vector_unary(mach, inst, micro_ddx, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   3590       break;
   3591 
   3592    case TGSI_OPCODE_DDY:
   3593       exec_vector_unary(mach, inst, micro_ddy, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   3594       break;
   3595 
   3596    case TGSI_OPCODE_KILP:
   3597       exec_kilp (mach, inst);
   3598       break;
   3599 
   3600    case TGSI_OPCODE_KIL:
   3601       exec_kil (mach, inst);
   3602       break;
   3603 
   3604    case TGSI_OPCODE_PK2H:
   3605       assert (0);
   3606       break;
   3607 
   3608    case TGSI_OPCODE_PK2US:
   3609       assert (0);
   3610       break;
   3611 
   3612    case TGSI_OPCODE_PK4B:
   3613       assert (0);
   3614       break;
   3615 
   3616    case TGSI_OPCODE_PK4UB:
   3617       assert (0);
   3618       break;
   3619 
   3620    case TGSI_OPCODE_RFL:
   3621       exec_rfl(mach, inst);
   3622       break;
   3623 
   3624    case TGSI_OPCODE_SEQ:
   3625       exec_vector_binary(mach, inst, micro_seq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   3626       break;
   3627 
   3628    case TGSI_OPCODE_SFL:
   3629       exec_vector(mach, inst, micro_sfl, TGSI_EXEC_DATA_FLOAT);
   3630       break;
   3631 
   3632    case TGSI_OPCODE_SGT:
   3633       exec_vector_binary(mach, inst, micro_sgt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   3634       break;
   3635 
   3636    case TGSI_OPCODE_SIN:
   3637       exec_scalar_unary(mach, inst, micro_sin, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   3638       break;
   3639 
   3640    case TGSI_OPCODE_SLE:
   3641       exec_vector_binary(mach, inst, micro_sle, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   3642       break;
   3643 
   3644    case TGSI_OPCODE_SNE:
   3645       exec_vector_binary(mach, inst, micro_sne, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   3646       break;
   3647 
   3648    case TGSI_OPCODE_STR:
   3649       exec_vector(mach, inst, micro_str, TGSI_EXEC_DATA_FLOAT);
   3650       break;
   3651 
   3652    case TGSI_OPCODE_TEX:
   3653       /* simple texture lookup */
   3654       /* src[0] = texcoord */
   3655       /* src[1] = sampler unit */
   3656       exec_tex(mach, inst, TEX_MODIFIER_NONE);
   3657       break;
   3658 
   3659    case TGSI_OPCODE_TXB:
   3660       /* Texture lookup with lod bias */
   3661       /* src[0] = texcoord (src[0].w = LOD bias) */
   3662       /* src[1] = sampler unit */
   3663       exec_tex(mach, inst, TEX_MODIFIER_LOD_BIAS);
   3664       break;
   3665 
   3666    case TGSI_OPCODE_TXD:
   3667       /* Texture lookup with explicit partial derivatives */
   3668       /* src[0] = texcoord */
   3669       /* src[1] = d[strq]/dx */
   3670       /* src[2] = d[strq]/dy */
   3671       /* src[3] = sampler unit */
   3672       exec_txd(mach, inst);
   3673       break;
   3674 
   3675    case TGSI_OPCODE_TXL:
   3676       /* Texture lookup with explicit LOD */
   3677       /* src[0] = texcoord (src[0].w = LOD) */
   3678       /* src[1] = sampler unit */
   3679       exec_tex(mach, inst, TEX_MODIFIER_EXPLICIT_LOD);
   3680       break;
   3681 
   3682    case TGSI_OPCODE_TXP:
   3683       /* Texture lookup with projection */
   3684       /* src[0] = texcoord (src[0].w = projection) */
   3685       /* src[1] = sampler unit */
   3686       exec_tex(mach, inst, TEX_MODIFIER_PROJECTED);
   3687       break;
   3688 
   3689    case TGSI_OPCODE_UP2H:
   3690       assert (0);
   3691       break;
   3692 
   3693    case TGSI_OPCODE_UP2US:
   3694       assert (0);
   3695       break;
   3696 
   3697    case TGSI_OPCODE_UP4B:
   3698       assert (0);
   3699       break;
   3700 
   3701    case TGSI_OPCODE_UP4UB:
   3702       assert (0);
   3703       break;
   3704 
   3705    case TGSI_OPCODE_X2D:
   3706       exec_x2d(mach, inst);
   3707       break;
   3708 
   3709    case TGSI_OPCODE_ARA:
   3710       assert (0);
   3711       break;
   3712 
   3713    case TGSI_OPCODE_ARR:
   3714       exec_vector_unary(mach, inst, micro_arr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
   3715       break;
   3716 
   3717    case TGSI_OPCODE_BRA:
   3718       assert (0);
   3719       break;
   3720 
   3721    case TGSI_OPCODE_CAL:
   3722       /* skip the call if no execution channels are enabled */
   3723       if (mach->ExecMask) {
   3724          /* do the call */
   3725 
   3726          /* First, record the depths of the execution stacks.
   3727           * This is important for deeply nested/looped return statements.
   3728           * We have to unwind the stacks by the correct amount.  For a
   3729           * real code generator, we could determine the number of entries
   3730           * to pop off each stack with simple static analysis and avoid
   3731           * implementing this data structure at run time.
   3732           */
   3733          mach->CallStack[mach->CallStackTop].CondStackTop = mach->CondStackTop;
   3734          mach->CallStack[mach->CallStackTop].LoopStackTop = mach->LoopStackTop;
   3735          mach->CallStack[mach->CallStackTop].ContStackTop = mach->ContStackTop;
   3736          mach->CallStack[mach->CallStackTop].SwitchStackTop = mach->SwitchStackTop;
   3737          mach->CallStack[mach->CallStackTop].BreakStackTop = mach->BreakStackTop;
   3738          /* note that PC was already incremented above */
   3739          mach->CallStack[mach->CallStackTop].ReturnAddr = *pc;
   3740 
   3741          mach->CallStackTop++;
   3742 
   3743          /* Second, push the Cond, Loop, Cont, Func stacks */
   3744          assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
   3745          assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
   3746          assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
   3747          assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
   3748          assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
   3749          assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
   3750 
   3751          mach->CondStack[mach->CondStackTop++] = mach->CondMask;
   3752          mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
   3753          mach->ContStack[mach->ContStackTop++] = mach->ContMask;
   3754          mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
   3755          mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
   3756          mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
   3757 
   3758          /* Finally, jump to the subroutine */
   3759          *pc = inst->Label.Label;
   3760       }
   3761       break;
   3762 
   3763    case TGSI_OPCODE_RET:
   3764       mach->FuncMask &= ~mach->ExecMask;
   3765       UPDATE_EXEC_MASK(mach);
   3766 
   3767       if (mach->FuncMask == 0x0) {
   3768          /* really return now (otherwise, keep executing) */
   3769 
   3770          if (mach->CallStackTop == 0) {
   3771             /* returning from main() */
   3772             mach->CondStackTop = 0;
   3773             mach->LoopStackTop = 0;
   3774             *pc = -1;
   3775             return;
   3776          }
   3777 
   3778          assert(mach->CallStackTop > 0);
   3779          mach->CallStackTop--;
   3780 
   3781          mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
   3782          mach->CondMask = mach->CondStack[mach->CondStackTop];
   3783 
   3784          mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
   3785          mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
   3786 
   3787          mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
   3788          mach->ContMask = mach->ContStack[mach->ContStackTop];
   3789 
   3790          mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
   3791          mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
   3792 
   3793          mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
   3794          mach->BreakType = mach->BreakStack[mach->BreakStackTop];
   3795 
   3796          assert(mach->FuncStackTop > 0);
   3797          mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
   3798 
   3799          *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
   3800 
   3801          UPDATE_EXEC_MASK(mach);
   3802       }
   3803       break;
   3804 
   3805    case TGSI_OPCODE_SSG:
   3806       exec_vector_unary(mach, inst, micro_sgn, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   3807       break;
   3808 
   3809    case TGSI_OPCODE_CMP:
   3810       exec_vector_trinary(mach, inst, micro_cmp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   3811       break;
   3812 
   3813    case TGSI_OPCODE_SCS:
   3814       exec_scs(mach, inst);
   3815       break;
   3816 
   3817    case TGSI_OPCODE_NRM:
   3818       exec_nrm3(mach, inst);
   3819       break;
   3820 
   3821    case TGSI_OPCODE_NRM4:
   3822       exec_nrm4(mach, inst);
   3823       break;
   3824 
   3825    case TGSI_OPCODE_DIV:
   3826       exec_vector_binary(mach, inst, micro_div, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   3827       break;
   3828 
   3829    case TGSI_OPCODE_DP2:
   3830       exec_dp2(mach, inst);
   3831       break;
   3832 
   3833    case TGSI_OPCODE_IF:
   3834       /* push CondMask */
   3835       assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
   3836       mach->CondStack[mach->CondStackTop++] = mach->CondMask;
   3837       FETCH( &r[0], 0, TGSI_CHAN_X );
   3838       /* update CondMask */
   3839       if( ! r[0].u[0] ) {
   3840          mach->CondMask &= ~0x1;
   3841       }
   3842       if( ! r[0].u[1] ) {
   3843          mach->CondMask &= ~0x2;
   3844       }
   3845       if( ! r[0].u[2] ) {
   3846          mach->CondMask &= ~0x4;
   3847       }
   3848       if( ! r[0].u[3] ) {
   3849          mach->CondMask &= ~0x8;
   3850       }
   3851       UPDATE_EXEC_MASK(mach);
   3852       /* Todo: If CondMask==0, jump to ELSE */
   3853       break;
   3854 
   3855    case TGSI_OPCODE_ELSE:
   3856       /* invert CondMask wrt previous mask */
   3857       {
   3858          uint prevMask;
   3859          assert(mach->CondStackTop > 0);
   3860          prevMask = mach->CondStack[mach->CondStackTop - 1];
   3861          mach->CondMask = ~mach->CondMask & prevMask;
   3862          UPDATE_EXEC_MASK(mach);
   3863          /* Todo: If CondMask==0, jump to ENDIF */
   3864       }
   3865       break;
   3866 
   3867    case TGSI_OPCODE_ENDIF:
   3868       /* pop CondMask */
   3869       assert(mach->CondStackTop > 0);
   3870       mach->CondMask = mach->CondStack[--mach->CondStackTop];
   3871       UPDATE_EXEC_MASK(mach);
   3872       break;
   3873 
   3874    case TGSI_OPCODE_END:
   3875       /* make sure we end primitives which haven't
   3876        * been explicitly emitted */
   3877       conditional_emit_primitive(mach);
   3878       /* halt execution */
   3879       *pc = -1;
   3880       break;
   3881 
   3882    case TGSI_OPCODE_PUSHA:
   3883       assert (0);
   3884       break;
   3885 
   3886    case TGSI_OPCODE_POPA:
   3887       assert (0);
   3888       break;
   3889 
   3890    case TGSI_OPCODE_CEIL:
   3891       exec_vector_unary(mach, inst, micro_ceil, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   3892       break;
   3893 
   3894    case TGSI_OPCODE_I2F:
   3895       exec_vector_unary(mach, inst, micro_i2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_INT);
   3896       break;
   3897 
   3898    case TGSI_OPCODE_NOT:
   3899       exec_vector_unary(mach, inst, micro_not, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   3900       break;
   3901 
   3902    case TGSI_OPCODE_TRUNC:
   3903       exec_vector_unary(mach, inst, micro_trunc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   3904       break;
   3905 
   3906    case TGSI_OPCODE_SHL:
   3907       exec_vector_binary(mach, inst, micro_shl, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   3908       break;
   3909 
   3910    case TGSI_OPCODE_AND:
   3911       exec_vector_binary(mach, inst, micro_and, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   3912       break;
   3913 
   3914    case TGSI_OPCODE_OR:
   3915       exec_vector_binary(mach, inst, micro_or, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   3916       break;
   3917 
   3918    case TGSI_OPCODE_MOD:
   3919       exec_vector_binary(mach, inst, micro_mod, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
   3920       break;
   3921 
   3922    case TGSI_OPCODE_XOR:
   3923       exec_vector_binary(mach, inst, micro_xor, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   3924       break;
   3925 
   3926    case TGSI_OPCODE_SAD:
   3927       assert (0);
   3928       break;
   3929 
   3930    case TGSI_OPCODE_TXF:
   3931       exec_txf(mach, inst);
   3932       break;
   3933 
   3934    case TGSI_OPCODE_TXQ:
   3935       exec_txq(mach, inst);
   3936       break;
   3937 
   3938    case TGSI_OPCODE_EMIT:
   3939       emit_vertex(mach);
   3940       break;
   3941 
   3942    case TGSI_OPCODE_ENDPRIM:
   3943       emit_primitive(mach);
   3944       break;
   3945 
   3946    case TGSI_OPCODE_BGNLOOP:
   3947       /* push LoopMask and ContMasks */
   3948       assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
   3949       assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
   3950       assert(mach->LoopLabelStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
   3951       assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
   3952 
   3953       mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
   3954       mach->ContStack[mach->ContStackTop++] = mach->ContMask;
   3955       mach->LoopLabelStack[mach->LoopLabelStackTop++] = *pc - 1;
   3956       mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
   3957       mach->BreakType = TGSI_EXEC_BREAK_INSIDE_LOOP;
   3958       break;
   3959 
   3960    case TGSI_OPCODE_ENDLOOP:
   3961       /* Restore ContMask, but don't pop */
   3962       assert(mach->ContStackTop > 0);
   3963       mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
   3964       UPDATE_EXEC_MASK(mach);
   3965       if (mach->ExecMask) {
   3966          /* repeat loop: jump to instruction just past BGNLOOP */
   3967          assert(mach->LoopLabelStackTop > 0);
   3968          *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
   3969       }
   3970       else {
   3971          /* exit loop: pop LoopMask */
   3972          assert(mach->LoopStackTop > 0);
   3973          mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
   3974          /* pop ContMask */
   3975          assert(mach->ContStackTop > 0);
   3976          mach->ContMask = mach->ContStack[--mach->ContStackTop];
   3977          assert(mach->LoopLabelStackTop > 0);
   3978          --mach->LoopLabelStackTop;
   3979 
   3980          mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
   3981       }
   3982       UPDATE_EXEC_MASK(mach);
   3983       break;
   3984 
   3985    case TGSI_OPCODE_BRK:
   3986       exec_break(mach);
   3987       break;
   3988 
   3989    case TGSI_OPCODE_CONT:
   3990       /* turn off cont channels for each enabled exec channel */
   3991       mach->ContMask &= ~mach->ExecMask;
   3992       /* Todo: if mach->LoopMask == 0, jump to end of loop */
   3993       UPDATE_EXEC_MASK(mach);
   3994       break;
   3995 
   3996    case TGSI_OPCODE_BGNSUB:
   3997       /* no-op */
   3998       break;
   3999 
   4000    case TGSI_OPCODE_ENDSUB:
   4001       /*
   4002        * XXX: This really should be a no-op. We should never reach this opcode.
   4003        */
   4004 
   4005       assert(mach->CallStackTop > 0);
   4006       mach->CallStackTop--;
   4007 
   4008       mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
   4009       mach->CondMask = mach->CondStack[mach->CondStackTop];
   4010 
   4011       mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
   4012       mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
   4013 
   4014       mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
   4015       mach->ContMask = mach->ContStack[mach->ContStackTop];
   4016 
   4017       mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
   4018       mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
   4019 
   4020       mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
   4021       mach->BreakType = mach->BreakStack[mach->BreakStackTop];
   4022 
   4023       assert(mach->FuncStackTop > 0);
   4024       mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
   4025 
   4026       *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
   4027 
   4028       UPDATE_EXEC_MASK(mach);
   4029       break;
   4030 
   4031    case TGSI_OPCODE_NOP:
   4032       break;
   4033 
   4034    case TGSI_OPCODE_BREAKC:
   4035       FETCH(&r[0], 0, TGSI_CHAN_X);
   4036       /* update LoopMask */
   4037       if (r[0].u[0] && (mach->ExecMask & 0x1)) {
   4038          mach->LoopMask &= ~0x1;
   4039       }
   4040       if (r[0].u[1] && (mach->ExecMask & 0x2)) {
   4041          mach->LoopMask &= ~0x2;
   4042       }
   4043       if (r[0].u[2] && (mach->ExecMask & 0x4)) {
   4044          mach->LoopMask &= ~0x4;
   4045       }
   4046       if (r[0].u[3] && (mach->ExecMask & 0x8)) {
   4047          mach->LoopMask &= ~0x8;
   4048       }
   4049       /* Todo: if mach->LoopMask == 0, jump to end of loop */
   4050       UPDATE_EXEC_MASK(mach);
   4051       break;
   4052 
   4053    case TGSI_OPCODE_F2I:
   4054       exec_vector_unary(mach, inst, micro_f2i, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
   4055       break;
   4056 
   4057    case TGSI_OPCODE_IDIV:
   4058       exec_vector_binary(mach, inst, micro_idiv, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
   4059       break;
   4060 
   4061    case TGSI_OPCODE_IMAX:
   4062       exec_vector_binary(mach, inst, micro_imax, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
   4063       break;
   4064 
   4065    case TGSI_OPCODE_IMIN:
   4066       exec_vector_binary(mach, inst, micro_imin, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
   4067       break;
   4068 
   4069    case TGSI_OPCODE_INEG:
   4070       exec_vector_unary(mach, inst, micro_ineg, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
   4071       break;
   4072 
   4073    case TGSI_OPCODE_ISGE:
   4074       exec_vector_binary(mach, inst, micro_isge, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
   4075       break;
   4076 
   4077    case TGSI_OPCODE_ISHR:
   4078       exec_vector_binary(mach, inst, micro_ishr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
   4079       break;
   4080 
   4081    case TGSI_OPCODE_ISLT:
   4082       exec_vector_binary(mach, inst, micro_islt, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
   4083       break;
   4084 
   4085    case TGSI_OPCODE_F2U:
   4086       exec_vector_unary(mach, inst, micro_f2u, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
   4087       break;
   4088 
   4089    case TGSI_OPCODE_U2F:
   4090       exec_vector_unary(mach, inst, micro_u2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_UINT);
   4091       break;
   4092 
   4093    case TGSI_OPCODE_UADD:
   4094       exec_vector_binary(mach, inst, micro_uadd, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   4095       break;
   4096 
   4097    case TGSI_OPCODE_UDIV:
   4098       exec_vector_binary(mach, inst, micro_udiv, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   4099       break;
   4100 
   4101    case TGSI_OPCODE_UMAD:
   4102       exec_vector_trinary(mach, inst, micro_umad, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   4103       break;
   4104 
   4105    case TGSI_OPCODE_UMAX:
   4106       exec_vector_binary(mach, inst, micro_umax, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   4107       break;
   4108 
   4109    case TGSI_OPCODE_UMIN:
   4110       exec_vector_binary(mach, inst, micro_umin, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   4111       break;
   4112 
   4113    case TGSI_OPCODE_UMOD:
   4114       exec_vector_binary(mach, inst, micro_umod, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   4115       break;
   4116 
   4117    case TGSI_OPCODE_UMUL:
   4118       exec_vector_binary(mach, inst, micro_umul, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   4119       break;
   4120 
   4121    case TGSI_OPCODE_USEQ:
   4122       exec_vector_binary(mach, inst, micro_useq, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   4123       break;
   4124 
   4125    case TGSI_OPCODE_USGE:
   4126       exec_vector_binary(mach, inst, micro_usge, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   4127       break;
   4128 
   4129    case TGSI_OPCODE_USHR:
   4130       exec_vector_binary(mach, inst, micro_ushr, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   4131       break;
   4132 
   4133    case TGSI_OPCODE_USLT:
   4134       exec_vector_binary(mach, inst, micro_uslt, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   4135       break;
   4136 
   4137    case TGSI_OPCODE_USNE:
   4138       exec_vector_binary(mach, inst, micro_usne, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   4139       break;
   4140 
   4141    case TGSI_OPCODE_SWITCH:
   4142       exec_switch(mach, inst);
   4143       break;
   4144 
   4145    case TGSI_OPCODE_CASE:
   4146       exec_case(mach, inst);
   4147       break;
   4148 
   4149    case TGSI_OPCODE_DEFAULT:
   4150       exec_default(mach);
   4151       break;
   4152 
   4153    case TGSI_OPCODE_ENDSWITCH:
   4154       exec_endswitch(mach);
   4155       break;
   4156 
   4157    case TGSI_OPCODE_SAMPLE_I:
   4158       assert(0);
   4159       break;
   4160 
   4161    case TGSI_OPCODE_SAMPLE_I_MS:
   4162       assert(0);
   4163       break;
   4164 
   4165    case TGSI_OPCODE_SAMPLE:
   4166       exec_sample(mach, inst, TEX_MODIFIER_NONE);
   4167       break;
   4168 
   4169    case TGSI_OPCODE_SAMPLE_B:
   4170       exec_sample(mach, inst, TEX_MODIFIER_LOD_BIAS);
   4171       break;
   4172 
   4173    case TGSI_OPCODE_SAMPLE_C:
   4174       exec_sample(mach, inst, TEX_MODIFIER_NONE);
   4175       break;
   4176 
   4177    case TGSI_OPCODE_SAMPLE_C_LZ:
   4178       exec_sample(mach, inst, TEX_MODIFIER_LOD_BIAS);
   4179       break;
   4180 
   4181    case TGSI_OPCODE_SAMPLE_D:
   4182       exec_sample_d(mach, inst);
   4183       break;
   4184 
   4185    case TGSI_OPCODE_SAMPLE_L:
   4186       exec_sample(mach, inst, TEX_MODIFIER_EXPLICIT_LOD);
   4187       break;
   4188 
   4189    case TGSI_OPCODE_GATHER4:
   4190       assert(0);
   4191       break;
   4192 
   4193    case TGSI_OPCODE_SVIEWINFO:
   4194       assert(0);
   4195       break;
   4196 
   4197    case TGSI_OPCODE_SAMPLE_POS:
   4198       assert(0);
   4199       break;
   4200 
   4201    case TGSI_OPCODE_SAMPLE_INFO:
   4202       assert(0);
   4203       break;
   4204 
   4205    case TGSI_OPCODE_UARL:
   4206       exec_vector_unary(mach, inst, micro_uarl, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_UINT);
   4207       break;
   4208 
   4209    case TGSI_OPCODE_UCMP:
   4210       exec_vector_trinary(mach, inst, micro_ucmp, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   4211       break;
   4212 
   4213    case TGSI_OPCODE_IABS:
   4214       exec_vector_unary(mach, inst, micro_iabs, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
   4215       break;
   4216 
   4217    case TGSI_OPCODE_ISSG:
   4218       exec_vector_unary(mach, inst, micro_isgn, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
   4219       break;
   4220 
   4221    default:
   4222       assert( 0 );
   4223    }
   4224 }
   4225 
   4226 
   4227 #define DEBUG_EXECUTION 0
   4228 
   4229 
   4230 /**
   4231  * Run TGSI interpreter.
   4232  * \return bitmask of "alive" quad components
   4233  */
   4234 uint
   4235 tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
   4236 {
   4237    uint i;
   4238    int pc = 0;
   4239 
   4240    mach->CondMask = 0xf;
   4241    mach->LoopMask = 0xf;
   4242    mach->ContMask = 0xf;
   4243    mach->FuncMask = 0xf;
   4244    mach->ExecMask = 0xf;
   4245 
   4246    mach->Switch.mask = 0xf;
   4247 
   4248    assert(mach->CondStackTop == 0);
   4249    assert(mach->LoopStackTop == 0);
   4250    assert(mach->ContStackTop == 0);
   4251    assert(mach->SwitchStackTop == 0);
   4252    assert(mach->BreakStackTop == 0);
   4253    assert(mach->CallStackTop == 0);
   4254 
   4255    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
   4256    mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
   4257 
   4258    if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) {
   4259       mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
   4260       mach->Primitives[0] = 0;
   4261    }
   4262 
   4263    /* execute declarations (interpolants) */
   4264    for (i = 0; i < mach->NumDeclarations; i++) {
   4265       exec_declaration( mach, mach->Declarations+i );
   4266    }
   4267 
   4268    {
   4269 #if DEBUG_EXECUTION
   4270       struct tgsi_exec_vector temps[TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS];
   4271       struct tgsi_exec_vector outputs[PIPE_MAX_ATTRIBS];
   4272       uint inst = 1;
   4273 
   4274       memcpy(temps, mach->Temps, sizeof(temps));
   4275       memcpy(outputs, mach->Outputs, sizeof(outputs));
   4276 #endif
   4277 
   4278       /* execute instructions, until pc is set to -1 */
   4279       while (pc != -1) {
   4280 
   4281 #if DEBUG_EXECUTION
   4282          uint i;
   4283 
   4284          tgsi_dump_instruction(&mach->Instructions[pc], inst++);
   4285 #endif
   4286 
   4287          assert(pc < (int) mach->NumInstructions);
   4288          exec_instruction(mach, mach->Instructions + pc, &pc);
   4289 
   4290 #if DEBUG_EXECUTION
   4291          for (i = 0; i < TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS; i++) {
   4292             if (memcmp(&temps[i], &mach->Temps[i], sizeof(temps[i]))) {
   4293                uint j;
   4294 
   4295                memcpy(&temps[i], &mach->Temps[i], sizeof(temps[i]));
   4296                debug_printf("TEMP[%2u] = ", i);
   4297                for (j = 0; j < 4; j++) {
   4298                   if (j > 0) {
   4299                      debug_printf("           ");
   4300                   }
   4301                   debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
   4302                                temps[i].xyzw[0].f[j], temps[i].xyzw[0].u[j],
   4303                                temps[i].xyzw[1].f[j], temps[i].xyzw[1].u[j],
   4304                                temps[i].xyzw[2].f[j], temps[i].xyzw[2].u[j],
   4305                                temps[i].xyzw[3].f[j], temps[i].xyzw[3].u[j]);
   4306                }
   4307             }
   4308          }
   4309          for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
   4310             if (memcmp(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]))) {
   4311                uint j;
   4312 
   4313                memcpy(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]));
   4314                debug_printf("OUT[%2u] =  ", i);
   4315                for (j = 0; j < 4; j++) {
   4316                   if (j > 0) {
   4317                      debug_printf("           ");
   4318                   }
   4319                   debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
   4320                                outputs[i].xyzw[0].f[j], outputs[i].xyzw[0].u[j],
   4321                                outputs[i].xyzw[1].f[j], outputs[i].xyzw[1].u[j],
   4322                                outputs[i].xyzw[2].f[j], outputs[i].xyzw[2].u[j],
   4323                                outputs[i].xyzw[3].f[j], outputs[i].xyzw[3].u[j]);
   4324                }
   4325             }
   4326          }
   4327 #endif
   4328       }
   4329    }
   4330 
   4331 #if 0
   4332    /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
   4333    if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
   4334       /*
   4335        * Scale back depth component.
   4336        */
   4337       for (i = 0; i < 4; i++)
   4338          mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
   4339    }
   4340 #endif
   4341 
   4342    /* Strictly speaking, these assertions aren't really needed but they
   4343     * can potentially catch some bugs in the control flow code.
   4344     */
   4345    assert(mach->CondStackTop == 0);
   4346    assert(mach->LoopStackTop == 0);
   4347    assert(mach->ContStackTop == 0);
   4348    assert(mach->SwitchStackTop == 0);
   4349    assert(mach->BreakStackTop == 0);
   4350    assert(mach->CallStackTop == 0);
   4351 
   4352    return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
   4353 }
   4354