Home | History | Annotate | Download | only in tgsi
      1 /**************************************************************************
      2  *
      3  * Copyright 2007-2008 VMware, Inc.
      4  * All Rights Reserved.
      5  * Copyright 2009-2010 VMware, Inc.  All rights Reserved.
      6  *
      7  * Permission is hereby granted, free of charge, to any person obtaining a
      8  * copy of this software and associated documentation files (the
      9  * "Software"), to deal in the Software without restriction, including
     10  * without limitation the rights to use, copy, modify, merge, publish,
     11  * distribute, sub license, and/or sell copies of the Software, and to
     12  * permit persons to whom the Software is furnished to do so, subject to
     13  * the following conditions:
     14  *
     15  * The above copyright notice and this permission notice (including the
     16  * next paragraph) shall be included in all copies or substantial portions
     17  * of the Software.
     18  *
     19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
     20  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
     21  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
     22  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
     23  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
     24  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
     25  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
     26  *
     27  **************************************************************************/
     28 
     29 /**
     30  * TGSI interpreter/executor.
     31  *
     32  * Flow control information:
     33  *
     34  * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
     35  * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
     36  * care since a condition may be true for some quad components but false
     37  * for other components.
     38  *
     39  * We basically execute all statements (even if they're in the part of
     40  * an IF/ELSE clause that's "not taken") and use a special mask to
     41  * control writing to destination registers.  This is the ExecMask.
     42  * See store_dest().
     43  *
 * The ExecMask is computed from three other masks (CondMask, LoopMask and
 * ContMask) which are controlled by the flow control instructions (namely:
 * IF/ELSE/ENDIF, LOOP/ENDLOOP and CONT).
     47  *
     48  *
     49  * Authors:
     50  *   Michal Krol
     51  *   Brian Paul
     52  */
     53 
     54 #include "pipe/p_compiler.h"
     55 #include "pipe/p_state.h"
     56 #include "pipe/p_shader_tokens.h"
     57 #include "tgsi/tgsi_dump.h"
     58 #include "tgsi/tgsi_parse.h"
     59 #include "tgsi/tgsi_util.h"
     60 #include "tgsi_exec.h"
     61 #include "util/u_half.h"
     62 #include "util/u_memory.h"
     63 #include "util/u_math.h"
     64 #include "util/rounding.h"
     65 
     66 
/*
 * Compile-time debugging switch.
 * NOTE(review): the code guarded by this macro lies outside this excerpt;
 * confirm exactly what it enables before flipping it.
 */
#define DEBUG_EXECUTION 0


/*
 * When non-zero, micro_exp2() and micro_lg2() call util_fast_exp2() /
 * util_fast_log2() instead of their libm-based code paths.
 */
#define FAST_MATH 0

/*
 * Lane indices within a channel, named after the position in the 2x2 tile.
 * micro_ddx() and micro_ddy() use them to pick the lanes they subtract.
 */
#define TILE_TOP_LEFT     0
#define TILE_TOP_RIGHT    1
#define TILE_BOTTOM_LEFT  2
#define TILE_BOTTOM_RIGHT 3
     76 
/**
 * One channel of 64-bit values, with TGSI_QUAD_SIZE lanes.
 *
 * All members alias the same storage; each micro_d*, micro_i64* and
 * micro_u64* helper picks the view that matches its operand type.
 */
union tgsi_double_channel {
   /* Lanes viewed as doubles. */
   double d[TGSI_QUAD_SIZE];
   /* Lanes viewed as two unsigned words each; the comparison helpers
    * (e.g. micro_dslt) store their ~0U / 0U result in word [0] only. */
   unsigned u[TGSI_QUAD_SIZE][2];
   /* Lanes viewed as unsigned 64-bit integers. */
   uint64_t u64[TGSI_QUAD_SIZE];
   /* Lanes viewed as signed 64-bit integers. */
   int64_t i64[TGSI_QUAD_SIZE];
};
     83 
/**
 * A pair of 64-bit channels, named xy and zw.
 *
 * NOTE(review): presumably the 64-bit view of a four-component register,
 * with each 64-bit value spanning two 32-bit components -- confirm against
 * the users of this struct, which are outside this excerpt.
 */
struct tgsi_double_vector {
   union tgsi_double_channel xy;
   union tgsi_double_channel zw;
};
     88 
     89 static void
     90 micro_abs(union tgsi_exec_channel *dst,
     91           const union tgsi_exec_channel *src)
     92 {
     93    dst->f[0] = fabsf(src->f[0]);
     94    dst->f[1] = fabsf(src->f[1]);
     95    dst->f[2] = fabsf(src->f[2]);
     96    dst->f[3] = fabsf(src->f[3]);
     97 }
     98 
     99 static void
    100 micro_arl(union tgsi_exec_channel *dst,
    101           const union tgsi_exec_channel *src)
    102 {
    103    dst->i[0] = (int)floorf(src->f[0]);
    104    dst->i[1] = (int)floorf(src->f[1]);
    105    dst->i[2] = (int)floorf(src->f[2]);
    106    dst->i[3] = (int)floorf(src->f[3]);
    107 }
    108 
    109 static void
    110 micro_arr(union tgsi_exec_channel *dst,
    111           const union tgsi_exec_channel *src)
    112 {
    113    dst->i[0] = (int)floorf(src->f[0] + 0.5f);
    114    dst->i[1] = (int)floorf(src->f[1] + 0.5f);
    115    dst->i[2] = (int)floorf(src->f[2] + 0.5f);
    116    dst->i[3] = (int)floorf(src->f[3] + 0.5f);
    117 }
    118 
    119 static void
    120 micro_ceil(union tgsi_exec_channel *dst,
    121            const union tgsi_exec_channel *src)
    122 {
    123    dst->f[0] = ceilf(src->f[0]);
    124    dst->f[1] = ceilf(src->f[1]);
    125    dst->f[2] = ceilf(src->f[2]);
    126    dst->f[3] = ceilf(src->f[3]);
    127 }
    128 
    129 static void
    130 micro_clamp(union tgsi_exec_channel *dst,
    131             const union tgsi_exec_channel *src0,
    132             const union tgsi_exec_channel *src1,
    133             const union tgsi_exec_channel *src2)
    134 {
    135    dst->f[0] = src0->f[0] < src1->f[0] ? src1->f[0] : src0->f[0] > src2->f[0] ? src2->f[0] : src0->f[0];
    136    dst->f[1] = src0->f[1] < src1->f[1] ? src1->f[1] : src0->f[1] > src2->f[1] ? src2->f[1] : src0->f[1];
    137    dst->f[2] = src0->f[2] < src1->f[2] ? src1->f[2] : src0->f[2] > src2->f[2] ? src2->f[2] : src0->f[2];
    138    dst->f[3] = src0->f[3] < src1->f[3] ? src1->f[3] : src0->f[3] > src2->f[3] ? src2->f[3] : src0->f[3];
    139 }
    140 
    141 static void
    142 micro_cmp(union tgsi_exec_channel *dst,
    143           const union tgsi_exec_channel *src0,
    144           const union tgsi_exec_channel *src1,
    145           const union tgsi_exec_channel *src2)
    146 {
    147    dst->f[0] = src0->f[0] < 0.0f ? src1->f[0] : src2->f[0];
    148    dst->f[1] = src0->f[1] < 0.0f ? src1->f[1] : src2->f[1];
    149    dst->f[2] = src0->f[2] < 0.0f ? src1->f[2] : src2->f[2];
    150    dst->f[3] = src0->f[3] < 0.0f ? src1->f[3] : src2->f[3];
    151 }
    152 
    153 static void
    154 micro_cos(union tgsi_exec_channel *dst,
    155           const union tgsi_exec_channel *src)
    156 {
    157    dst->f[0] = cosf(src->f[0]);
    158    dst->f[1] = cosf(src->f[1]);
    159    dst->f[2] = cosf(src->f[2]);
    160    dst->f[3] = cosf(src->f[3]);
    161 }
    162 
    163 static void
    164 micro_d2f(union tgsi_exec_channel *dst,
    165           const union tgsi_double_channel *src)
    166 {
    167    dst->f[0] = (float)src->d[0];
    168    dst->f[1] = (float)src->d[1];
    169    dst->f[2] = (float)src->d[2];
    170    dst->f[3] = (float)src->d[3];
    171 }
    172 
    173 static void
    174 micro_d2i(union tgsi_exec_channel *dst,
    175           const union tgsi_double_channel *src)
    176 {
    177    dst->i[0] = (int)src->d[0];
    178    dst->i[1] = (int)src->d[1];
    179    dst->i[2] = (int)src->d[2];
    180    dst->i[3] = (int)src->d[3];
    181 }
    182 
    183 static void
    184 micro_d2u(union tgsi_exec_channel *dst,
    185           const union tgsi_double_channel *src)
    186 {
    187    dst->u[0] = (unsigned)src->d[0];
    188    dst->u[1] = (unsigned)src->d[1];
    189    dst->u[2] = (unsigned)src->d[2];
    190    dst->u[3] = (unsigned)src->d[3];
    191 }
    192 static void
    193 micro_dabs(union tgsi_double_channel *dst,
    194            const union tgsi_double_channel *src)
    195 {
    196    dst->d[0] = src->d[0] >= 0.0 ? src->d[0] : -src->d[0];
    197    dst->d[1] = src->d[1] >= 0.0 ? src->d[1] : -src->d[1];
    198    dst->d[2] = src->d[2] >= 0.0 ? src->d[2] : -src->d[2];
    199    dst->d[3] = src->d[3] >= 0.0 ? src->d[3] : -src->d[3];
    200 }
    201 
    202 static void
    203 micro_dadd(union tgsi_double_channel *dst,
    204           const union tgsi_double_channel *src)
    205 {
    206    dst->d[0] = src[0].d[0] + src[1].d[0];
    207    dst->d[1] = src[0].d[1] + src[1].d[1];
    208    dst->d[2] = src[0].d[2] + src[1].d[2];
    209    dst->d[3] = src[0].d[3] + src[1].d[3];
    210 }
    211 
    212 static void
    213 micro_ddiv(union tgsi_double_channel *dst,
    214           const union tgsi_double_channel *src)
    215 {
    216    dst->d[0] = src[0].d[0] / src[1].d[0];
    217    dst->d[1] = src[0].d[1] / src[1].d[1];
    218    dst->d[2] = src[0].d[2] / src[1].d[2];
    219    dst->d[3] = src[0].d[3] / src[1].d[3];
    220 }
    221 
    222 static void
    223 micro_ddx(union tgsi_exec_channel *dst,
    224           const union tgsi_exec_channel *src)
    225 {
    226    dst->f[0] =
    227    dst->f[1] =
    228    dst->f[2] =
    229    dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
    230 }
    231 
    232 static void
    233 micro_ddy(union tgsi_exec_channel *dst,
    234           const union tgsi_exec_channel *src)
    235 {
    236    dst->f[0] =
    237    dst->f[1] =
    238    dst->f[2] =
    239    dst->f[3] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
    240 }
    241 
    242 static void
    243 micro_dmul(union tgsi_double_channel *dst,
    244            const union tgsi_double_channel *src)
    245 {
    246    dst->d[0] = src[0].d[0] * src[1].d[0];
    247    dst->d[1] = src[0].d[1] * src[1].d[1];
    248    dst->d[2] = src[0].d[2] * src[1].d[2];
    249    dst->d[3] = src[0].d[3] * src[1].d[3];
    250 }
    251 
    252 static void
    253 micro_dmax(union tgsi_double_channel *dst,
    254            const union tgsi_double_channel *src)
    255 {
    256    dst->d[0] = src[0].d[0] > src[1].d[0] ? src[0].d[0] : src[1].d[0];
    257    dst->d[1] = src[0].d[1] > src[1].d[1] ? src[0].d[1] : src[1].d[1];
    258    dst->d[2] = src[0].d[2] > src[1].d[2] ? src[0].d[2] : src[1].d[2];
    259    dst->d[3] = src[0].d[3] > src[1].d[3] ? src[0].d[3] : src[1].d[3];
    260 }
    261 
    262 static void
    263 micro_dmin(union tgsi_double_channel *dst,
    264            const union tgsi_double_channel *src)
    265 {
    266    dst->d[0] = src[0].d[0] < src[1].d[0] ? src[0].d[0] : src[1].d[0];
    267    dst->d[1] = src[0].d[1] < src[1].d[1] ? src[0].d[1] : src[1].d[1];
    268    dst->d[2] = src[0].d[2] < src[1].d[2] ? src[0].d[2] : src[1].d[2];
    269    dst->d[3] = src[0].d[3] < src[1].d[3] ? src[0].d[3] : src[1].d[3];
    270 }
    271 
    272 static void
    273 micro_dneg(union tgsi_double_channel *dst,
    274            const union tgsi_double_channel *src)
    275 {
    276    dst->d[0] = -src->d[0];
    277    dst->d[1] = -src->d[1];
    278    dst->d[2] = -src->d[2];
    279    dst->d[3] = -src->d[3];
    280 }
    281 
    282 static void
    283 micro_dslt(union tgsi_double_channel *dst,
    284            const union tgsi_double_channel *src)
    285 {
    286    dst->u[0][0] = src[0].d[0] < src[1].d[0] ? ~0U : 0U;
    287    dst->u[1][0] = src[0].d[1] < src[1].d[1] ? ~0U : 0U;
    288    dst->u[2][0] = src[0].d[2] < src[1].d[2] ? ~0U : 0U;
    289    dst->u[3][0] = src[0].d[3] < src[1].d[3] ? ~0U : 0U;
    290 }
    291 
    292 static void
    293 micro_dsne(union tgsi_double_channel *dst,
    294            const union tgsi_double_channel *src)
    295 {
    296    dst->u[0][0] = src[0].d[0] != src[1].d[0] ? ~0U : 0U;
    297    dst->u[1][0] = src[0].d[1] != src[1].d[1] ? ~0U : 0U;
    298    dst->u[2][0] = src[0].d[2] != src[1].d[2] ? ~0U : 0U;
    299    dst->u[3][0] = src[0].d[3] != src[1].d[3] ? ~0U : 0U;
    300 }
    301 
    302 static void
    303 micro_dsge(union tgsi_double_channel *dst,
    304            const union tgsi_double_channel *src)
    305 {
    306    dst->u[0][0] = src[0].d[0] >= src[1].d[0] ? ~0U : 0U;
    307    dst->u[1][0] = src[0].d[1] >= src[1].d[1] ? ~0U : 0U;
    308    dst->u[2][0] = src[0].d[2] >= src[1].d[2] ? ~0U : 0U;
    309    dst->u[3][0] = src[0].d[3] >= src[1].d[3] ? ~0U : 0U;
    310 }
    311 
    312 static void
    313 micro_dseq(union tgsi_double_channel *dst,
    314            const union tgsi_double_channel *src)
    315 {
    316    dst->u[0][0] = src[0].d[0] == src[1].d[0] ? ~0U : 0U;
    317    dst->u[1][0] = src[0].d[1] == src[1].d[1] ? ~0U : 0U;
    318    dst->u[2][0] = src[0].d[2] == src[1].d[2] ? ~0U : 0U;
    319    dst->u[3][0] = src[0].d[3] == src[1].d[3] ? ~0U : 0U;
    320 }
    321 
    322 static void
    323 micro_drcp(union tgsi_double_channel *dst,
    324            const union tgsi_double_channel *src)
    325 {
    326    dst->d[0] = 1.0 / src->d[0];
    327    dst->d[1] = 1.0 / src->d[1];
    328    dst->d[2] = 1.0 / src->d[2];
    329    dst->d[3] = 1.0 / src->d[3];
    330 }
    331 
    332 static void
    333 micro_dsqrt(union tgsi_double_channel *dst,
    334             const union tgsi_double_channel *src)
    335 {
    336    dst->d[0] = sqrt(src->d[0]);
    337    dst->d[1] = sqrt(src->d[1]);
    338    dst->d[2] = sqrt(src->d[2]);
    339    dst->d[3] = sqrt(src->d[3]);
    340 }
    341 
    342 static void
    343 micro_drsq(union tgsi_double_channel *dst,
    344           const union tgsi_double_channel *src)
    345 {
    346    dst->d[0] = 1.0 / sqrt(src->d[0]);
    347    dst->d[1] = 1.0 / sqrt(src->d[1]);
    348    dst->d[2] = 1.0 / sqrt(src->d[2]);
    349    dst->d[3] = 1.0 / sqrt(src->d[3]);
    350 }
    351 
    352 static void
    353 micro_dmad(union tgsi_double_channel *dst,
    354            const union tgsi_double_channel *src)
    355 {
    356    dst->d[0] = src[0].d[0] * src[1].d[0] + src[2].d[0];
    357    dst->d[1] = src[0].d[1] * src[1].d[1] + src[2].d[1];
    358    dst->d[2] = src[0].d[2] * src[1].d[2] + src[2].d[2];
    359    dst->d[3] = src[0].d[3] * src[1].d[3] + src[2].d[3];
    360 }
    361 
    362 static void
    363 micro_dfrac(union tgsi_double_channel *dst,
    364             const union tgsi_double_channel *src)
    365 {
    366    dst->d[0] = src->d[0] - floor(src->d[0]);
    367    dst->d[1] = src->d[1] - floor(src->d[1]);
    368    dst->d[2] = src->d[2] - floor(src->d[2]);
    369    dst->d[3] = src->d[3] - floor(src->d[3]);
    370 }
    371 
    372 static void
    373 micro_dldexp(union tgsi_double_channel *dst,
    374              const union tgsi_double_channel *src0,
    375              union tgsi_exec_channel *src1)
    376 {
    377    dst->d[0] = ldexp(src0->d[0], src1->i[0]);
    378    dst->d[1] = ldexp(src0->d[1], src1->i[1]);
    379    dst->d[2] = ldexp(src0->d[2], src1->i[2]);
    380    dst->d[3] = ldexp(src0->d[3], src1->i[3]);
    381 }
    382 
    383 static void
    384 micro_dfracexp(union tgsi_double_channel *dst,
    385                union tgsi_exec_channel *dst_exp,
    386                const union tgsi_double_channel *src)
    387 {
    388    dst->d[0] = frexp(src->d[0], &dst_exp->i[0]);
    389    dst->d[1] = frexp(src->d[1], &dst_exp->i[1]);
    390    dst->d[2] = frexp(src->d[2], &dst_exp->i[2]);
    391    dst->d[3] = frexp(src->d[3], &dst_exp->i[3]);
    392 }
    393 
    394 static void
    395 micro_exp2(union tgsi_exec_channel *dst,
    396            const union tgsi_exec_channel *src)
    397 {
    398 #if FAST_MATH
    399    dst->f[0] = util_fast_exp2(src->f[0]);
    400    dst->f[1] = util_fast_exp2(src->f[1]);
    401    dst->f[2] = util_fast_exp2(src->f[2]);
    402    dst->f[3] = util_fast_exp2(src->f[3]);
    403 #else
    404 #if DEBUG
    405    /* Inf is okay for this instruction, so clamp it to silence assertions. */
    406    uint i;
    407    union tgsi_exec_channel clamped;
    408 
    409    for (i = 0; i < 4; i++) {
    410       if (src->f[i] > 127.99999f) {
    411          clamped.f[i] = 127.99999f;
    412       } else if (src->f[i] < -126.99999f) {
    413          clamped.f[i] = -126.99999f;
    414       } else {
    415          clamped.f[i] = src->f[i];
    416       }
    417    }
    418    src = &clamped;
    419 #endif /* DEBUG */
    420 
    421    dst->f[0] = powf(2.0f, src->f[0]);
    422    dst->f[1] = powf(2.0f, src->f[1]);
    423    dst->f[2] = powf(2.0f, src->f[2]);
    424    dst->f[3] = powf(2.0f, src->f[3]);
    425 #endif /* FAST_MATH */
    426 }
    427 
    428 static void
    429 micro_f2d(union tgsi_double_channel *dst,
    430           const union tgsi_exec_channel *src)
    431 {
    432    dst->d[0] = (double)src->f[0];
    433    dst->d[1] = (double)src->f[1];
    434    dst->d[2] = (double)src->f[2];
    435    dst->d[3] = (double)src->f[3];
    436 }
    437 
    438 static void
    439 micro_flr(union tgsi_exec_channel *dst,
    440           const union tgsi_exec_channel *src)
    441 {
    442    dst->f[0] = floorf(src->f[0]);
    443    dst->f[1] = floorf(src->f[1]);
    444    dst->f[2] = floorf(src->f[2]);
    445    dst->f[3] = floorf(src->f[3]);
    446 }
    447 
    448 static void
    449 micro_frc(union tgsi_exec_channel *dst,
    450           const union tgsi_exec_channel *src)
    451 {
    452    dst->f[0] = src->f[0] - floorf(src->f[0]);
    453    dst->f[1] = src->f[1] - floorf(src->f[1]);
    454    dst->f[2] = src->f[2] - floorf(src->f[2]);
    455    dst->f[3] = src->f[3] - floorf(src->f[3]);
    456 }
    457 
    458 static void
    459 micro_i2d(union tgsi_double_channel *dst,
    460           const union tgsi_exec_channel *src)
    461 {
    462    dst->d[0] = (double)src->i[0];
    463    dst->d[1] = (double)src->i[1];
    464    dst->d[2] = (double)src->i[2];
    465    dst->d[3] = (double)src->i[3];
    466 }
    467 
    468 static void
    469 micro_iabs(union tgsi_exec_channel *dst,
    470            const union tgsi_exec_channel *src)
    471 {
    472    dst->i[0] = src->i[0] >= 0 ? src->i[0] : -src->i[0];
    473    dst->i[1] = src->i[1] >= 0 ? src->i[1] : -src->i[1];
    474    dst->i[2] = src->i[2] >= 0 ? src->i[2] : -src->i[2];
    475    dst->i[3] = src->i[3] >= 0 ? src->i[3] : -src->i[3];
    476 }
    477 
    478 static void
    479 micro_ineg(union tgsi_exec_channel *dst,
    480            const union tgsi_exec_channel *src)
    481 {
    482    dst->i[0] = -src->i[0];
    483    dst->i[1] = -src->i[1];
    484    dst->i[2] = -src->i[2];
    485    dst->i[3] = -src->i[3];
    486 }
    487 
    488 static void
    489 micro_lg2(union tgsi_exec_channel *dst,
    490           const union tgsi_exec_channel *src)
    491 {
    492 #if FAST_MATH
    493    dst->f[0] = util_fast_log2(src->f[0]);
    494    dst->f[1] = util_fast_log2(src->f[1]);
    495    dst->f[2] = util_fast_log2(src->f[2]);
    496    dst->f[3] = util_fast_log2(src->f[3]);
    497 #else
    498    dst->f[0] = logf(src->f[0]) * 1.442695f;
    499    dst->f[1] = logf(src->f[1]) * 1.442695f;
    500    dst->f[2] = logf(src->f[2]) * 1.442695f;
    501    dst->f[3] = logf(src->f[3]) * 1.442695f;
    502 #endif
    503 }
    504 
    505 static void
    506 micro_lrp(union tgsi_exec_channel *dst,
    507           const union tgsi_exec_channel *src0,
    508           const union tgsi_exec_channel *src1,
    509           const union tgsi_exec_channel *src2)
    510 {
    511    dst->f[0] = src0->f[0] * (src1->f[0] - src2->f[0]) + src2->f[0];
    512    dst->f[1] = src0->f[1] * (src1->f[1] - src2->f[1]) + src2->f[1];
    513    dst->f[2] = src0->f[2] * (src1->f[2] - src2->f[2]) + src2->f[2];
    514    dst->f[3] = src0->f[3] * (src1->f[3] - src2->f[3]) + src2->f[3];
    515 }
    516 
    517 static void
    518 micro_mad(union tgsi_exec_channel *dst,
    519           const union tgsi_exec_channel *src0,
    520           const union tgsi_exec_channel *src1,
    521           const union tgsi_exec_channel *src2)
    522 {
    523    dst->f[0] = src0->f[0] * src1->f[0] + src2->f[0];
    524    dst->f[1] = src0->f[1] * src1->f[1] + src2->f[1];
    525    dst->f[2] = src0->f[2] * src1->f[2] + src2->f[2];
    526    dst->f[3] = src0->f[3] * src1->f[3] + src2->f[3];
    527 }
    528 
    529 static void
    530 micro_mov(union tgsi_exec_channel *dst,
    531           const union tgsi_exec_channel *src)
    532 {
    533    dst->u[0] = src->u[0];
    534    dst->u[1] = src->u[1];
    535    dst->u[2] = src->u[2];
    536    dst->u[3] = src->u[3];
    537 }
    538 
    539 static void
    540 micro_rcp(union tgsi_exec_channel *dst,
    541           const union tgsi_exec_channel *src)
    542 {
    543 #if 0 /* for debugging */
    544    assert(src->f[0] != 0.0f);
    545    assert(src->f[1] != 0.0f);
    546    assert(src->f[2] != 0.0f);
    547    assert(src->f[3] != 0.0f);
    548 #endif
    549    dst->f[0] = 1.0f / src->f[0];
    550    dst->f[1] = 1.0f / src->f[1];
    551    dst->f[2] = 1.0f / src->f[2];
    552    dst->f[3] = 1.0f / src->f[3];
    553 }
    554 
    555 static void
    556 micro_rnd(union tgsi_exec_channel *dst,
    557           const union tgsi_exec_channel *src)
    558 {
    559    dst->f[0] = _mesa_roundevenf(src->f[0]);
    560    dst->f[1] = _mesa_roundevenf(src->f[1]);
    561    dst->f[2] = _mesa_roundevenf(src->f[2]);
    562    dst->f[3] = _mesa_roundevenf(src->f[3]);
    563 }
    564 
    565 static void
    566 micro_rsq(union tgsi_exec_channel *dst,
    567           const union tgsi_exec_channel *src)
    568 {
    569 #if 0 /* for debugging */
    570    assert(src->f[0] != 0.0f);
    571    assert(src->f[1] != 0.0f);
    572    assert(src->f[2] != 0.0f);
    573    assert(src->f[3] != 0.0f);
    574 #endif
    575    dst->f[0] = 1.0f / sqrtf(src->f[0]);
    576    dst->f[1] = 1.0f / sqrtf(src->f[1]);
    577    dst->f[2] = 1.0f / sqrtf(src->f[2]);
    578    dst->f[3] = 1.0f / sqrtf(src->f[3]);
    579 }
    580 
    581 static void
    582 micro_sqrt(union tgsi_exec_channel *dst,
    583            const union tgsi_exec_channel *src)
    584 {
    585    dst->f[0] = sqrtf(src->f[0]);
    586    dst->f[1] = sqrtf(src->f[1]);
    587    dst->f[2] = sqrtf(src->f[2]);
    588    dst->f[3] = sqrtf(src->f[3]);
    589 }
    590 
    591 static void
    592 micro_seq(union tgsi_exec_channel *dst,
    593           const union tgsi_exec_channel *src0,
    594           const union tgsi_exec_channel *src1)
    595 {
    596    dst->f[0] = src0->f[0] == src1->f[0] ? 1.0f : 0.0f;
    597    dst->f[1] = src0->f[1] == src1->f[1] ? 1.0f : 0.0f;
    598    dst->f[2] = src0->f[2] == src1->f[2] ? 1.0f : 0.0f;
    599    dst->f[3] = src0->f[3] == src1->f[3] ? 1.0f : 0.0f;
    600 }
    601 
    602 static void
    603 micro_sge(union tgsi_exec_channel *dst,
    604           const union tgsi_exec_channel *src0,
    605           const union tgsi_exec_channel *src1)
    606 {
    607    dst->f[0] = src0->f[0] >= src1->f[0] ? 1.0f : 0.0f;
    608    dst->f[1] = src0->f[1] >= src1->f[1] ? 1.0f : 0.0f;
    609    dst->f[2] = src0->f[2] >= src1->f[2] ? 1.0f : 0.0f;
    610    dst->f[3] = src0->f[3] >= src1->f[3] ? 1.0f : 0.0f;
    611 }
    612 
    613 static void
    614 micro_sgn(union tgsi_exec_channel *dst,
    615           const union tgsi_exec_channel *src)
    616 {
    617    dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
    618    dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
    619    dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
    620    dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
    621 }
    622 
    623 static void
    624 micro_isgn(union tgsi_exec_channel *dst,
    625           const union tgsi_exec_channel *src)
    626 {
    627    dst->i[0] = src->i[0] < 0 ? -1 : src->i[0] > 0 ? 1 : 0;
    628    dst->i[1] = src->i[1] < 0 ? -1 : src->i[1] > 0 ? 1 : 0;
    629    dst->i[2] = src->i[2] < 0 ? -1 : src->i[2] > 0 ? 1 : 0;
    630    dst->i[3] = src->i[3] < 0 ? -1 : src->i[3] > 0 ? 1 : 0;
    631 }
    632 
    633 static void
    634 micro_sgt(union tgsi_exec_channel *dst,
    635           const union tgsi_exec_channel *src0,
    636           const union tgsi_exec_channel *src1)
    637 {
    638    dst->f[0] = src0->f[0] > src1->f[0] ? 1.0f : 0.0f;
    639    dst->f[1] = src0->f[1] > src1->f[1] ? 1.0f : 0.0f;
    640    dst->f[2] = src0->f[2] > src1->f[2] ? 1.0f : 0.0f;
    641    dst->f[3] = src0->f[3] > src1->f[3] ? 1.0f : 0.0f;
    642 }
    643 
    644 static void
    645 micro_sin(union tgsi_exec_channel *dst,
    646           const union tgsi_exec_channel *src)
    647 {
    648    dst->f[0] = sinf(src->f[0]);
    649    dst->f[1] = sinf(src->f[1]);
    650    dst->f[2] = sinf(src->f[2]);
    651    dst->f[3] = sinf(src->f[3]);
    652 }
    653 
    654 static void
    655 micro_sle(union tgsi_exec_channel *dst,
    656           const union tgsi_exec_channel *src0,
    657           const union tgsi_exec_channel *src1)
    658 {
    659    dst->f[0] = src0->f[0] <= src1->f[0] ? 1.0f : 0.0f;
    660    dst->f[1] = src0->f[1] <= src1->f[1] ? 1.0f : 0.0f;
    661    dst->f[2] = src0->f[2] <= src1->f[2] ? 1.0f : 0.0f;
    662    dst->f[3] = src0->f[3] <= src1->f[3] ? 1.0f : 0.0f;
    663 }
    664 
    665 static void
    666 micro_slt(union tgsi_exec_channel *dst,
    667           const union tgsi_exec_channel *src0,
    668           const union tgsi_exec_channel *src1)
    669 {
    670    dst->f[0] = src0->f[0] < src1->f[0] ? 1.0f : 0.0f;
    671    dst->f[1] = src0->f[1] < src1->f[1] ? 1.0f : 0.0f;
    672    dst->f[2] = src0->f[2] < src1->f[2] ? 1.0f : 0.0f;
    673    dst->f[3] = src0->f[3] < src1->f[3] ? 1.0f : 0.0f;
    674 }
    675 
    676 static void
    677 micro_sne(union tgsi_exec_channel *dst,
    678           const union tgsi_exec_channel *src0,
    679           const union tgsi_exec_channel *src1)
    680 {
    681    dst->f[0] = src0->f[0] != src1->f[0] ? 1.0f : 0.0f;
    682    dst->f[1] = src0->f[1] != src1->f[1] ? 1.0f : 0.0f;
    683    dst->f[2] = src0->f[2] != src1->f[2] ? 1.0f : 0.0f;
    684    dst->f[3] = src0->f[3] != src1->f[3] ? 1.0f : 0.0f;
    685 }
    686 
    687 static void
    688 micro_trunc(union tgsi_exec_channel *dst,
    689             const union tgsi_exec_channel *src)
    690 {
    691    dst->f[0] = truncf(src->f[0]);
    692    dst->f[1] = truncf(src->f[1]);
    693    dst->f[2] = truncf(src->f[2]);
    694    dst->f[3] = truncf(src->f[3]);
    695 }
    696 
    697 static void
    698 micro_u2d(union tgsi_double_channel *dst,
    699           const union tgsi_exec_channel *src)
    700 {
    701    dst->d[0] = (double)src->u[0];
    702    dst->d[1] = (double)src->u[1];
    703    dst->d[2] = (double)src->u[2];
    704    dst->d[3] = (double)src->u[3];
    705 }
    706 
    707 static void
    708 micro_i64abs(union tgsi_double_channel *dst,
    709              const union tgsi_double_channel *src)
    710 {
    711    dst->i64[0] = src->i64[0] >= 0.0 ? src->i64[0] : -src->i64[0];
    712    dst->i64[1] = src->i64[1] >= 0.0 ? src->i64[1] : -src->i64[1];
    713    dst->i64[2] = src->i64[2] >= 0.0 ? src->i64[2] : -src->i64[2];
    714    dst->i64[3] = src->i64[3] >= 0.0 ? src->i64[3] : -src->i64[3];
    715 }
    716 
    717 static void
    718 micro_i64sgn(union tgsi_double_channel *dst,
    719              const union tgsi_double_channel *src)
    720 {
    721    dst->i64[0] = src->i64[0] < 0 ? -1 : src->i64[0] > 0 ? 1 : 0;
    722    dst->i64[1] = src->i64[1] < 0 ? -1 : src->i64[1] > 0 ? 1 : 0;
    723    dst->i64[2] = src->i64[2] < 0 ? -1 : src->i64[2] > 0 ? 1 : 0;
    724    dst->i64[3] = src->i64[3] < 0 ? -1 : src->i64[3] > 0 ? 1 : 0;
    725 }
    726 
    727 static void
    728 micro_i64neg(union tgsi_double_channel *dst,
    729              const union tgsi_double_channel *src)
    730 {
    731    dst->i64[0] = -src->i64[0];
    732    dst->i64[1] = -src->i64[1];
    733    dst->i64[2] = -src->i64[2];
    734    dst->i64[3] = -src->i64[3];
    735 }
    736 
    737 static void
    738 micro_u64seq(union tgsi_double_channel *dst,
    739            const union tgsi_double_channel *src)
    740 {
    741    dst->u[0][0] = src[0].u64[0] == src[1].u64[0] ? ~0U : 0U;
    742    dst->u[1][0] = src[0].u64[1] == src[1].u64[1] ? ~0U : 0U;
    743    dst->u[2][0] = src[0].u64[2] == src[1].u64[2] ? ~0U : 0U;
    744    dst->u[3][0] = src[0].u64[3] == src[1].u64[3] ? ~0U : 0U;
    745 }
    746 
    747 static void
    748 micro_u64sne(union tgsi_double_channel *dst,
    749              const union tgsi_double_channel *src)
    750 {
    751    dst->u[0][0] = src[0].u64[0] != src[1].u64[0] ? ~0U : 0U;
    752    dst->u[1][0] = src[0].u64[1] != src[1].u64[1] ? ~0U : 0U;
    753    dst->u[2][0] = src[0].u64[2] != src[1].u64[2] ? ~0U : 0U;
    754    dst->u[3][0] = src[0].u64[3] != src[1].u64[3] ? ~0U : 0U;
    755 }
    756 
    757 static void
    758 micro_i64slt(union tgsi_double_channel *dst,
    759              const union tgsi_double_channel *src)
    760 {
    761    dst->u[0][0] = src[0].i64[0] < src[1].i64[0] ? ~0U : 0U;
    762    dst->u[1][0] = src[0].i64[1] < src[1].i64[1] ? ~0U : 0U;
    763    dst->u[2][0] = src[0].i64[2] < src[1].i64[2] ? ~0U : 0U;
    764    dst->u[3][0] = src[0].i64[3] < src[1].i64[3] ? ~0U : 0U;
    765 }
    766 
    767 static void
    768 micro_u64slt(union tgsi_double_channel *dst,
    769              const union tgsi_double_channel *src)
    770 {
    771    dst->u[0][0] = src[0].u64[0] < src[1].u64[0] ? ~0U : 0U;
    772    dst->u[1][0] = src[0].u64[1] < src[1].u64[1] ? ~0U : 0U;
    773    dst->u[2][0] = src[0].u64[2] < src[1].u64[2] ? ~0U : 0U;
    774    dst->u[3][0] = src[0].u64[3] < src[1].u64[3] ? ~0U : 0U;
    775 }
    776 
    777 static void
    778 micro_i64sge(union tgsi_double_channel *dst,
    779            const union tgsi_double_channel *src)
    780 {
    781    dst->u[0][0] = src[0].i64[0] >= src[1].i64[0] ? ~0U : 0U;
    782    dst->u[1][0] = src[0].i64[1] >= src[1].i64[1] ? ~0U : 0U;
    783    dst->u[2][0] = src[0].i64[2] >= src[1].i64[2] ? ~0U : 0U;
    784    dst->u[3][0] = src[0].i64[3] >= src[1].i64[3] ? ~0U : 0U;
    785 }
    786 
    787 static void
    788 micro_u64sge(union tgsi_double_channel *dst,
    789              const union tgsi_double_channel *src)
    790 {
    791    dst->u[0][0] = src[0].u64[0] >= src[1].u64[0] ? ~0U : 0U;
    792    dst->u[1][0] = src[0].u64[1] >= src[1].u64[1] ? ~0U : 0U;
    793    dst->u[2][0] = src[0].u64[2] >= src[1].u64[2] ? ~0U : 0U;
    794    dst->u[3][0] = src[0].u64[3] >= src[1].u64[3] ? ~0U : 0U;
    795 }
    796 
    797 static void
    798 micro_u64max(union tgsi_double_channel *dst,
    799              const union tgsi_double_channel *src)
    800 {
    801    dst->u64[0] = src[0].u64[0] > src[1].u64[0] ? src[0].u64[0] : src[1].u64[0];
    802    dst->u64[1] = src[0].u64[1] > src[1].u64[1] ? src[0].u64[1] : src[1].u64[1];
    803    dst->u64[2] = src[0].u64[2] > src[1].u64[2] ? src[0].u64[2] : src[1].u64[2];
    804    dst->u64[3] = src[0].u64[3] > src[1].u64[3] ? src[0].u64[3] : src[1].u64[3];
    805 }
    806 
    807 static void
    808 micro_i64max(union tgsi_double_channel *dst,
    809              const union tgsi_double_channel *src)
    810 {
    811    dst->i64[0] = src[0].i64[0] > src[1].i64[0] ? src[0].i64[0] : src[1].i64[0];
    812    dst->i64[1] = src[0].i64[1] > src[1].i64[1] ? src[0].i64[1] : src[1].i64[1];
    813    dst->i64[2] = src[0].i64[2] > src[1].i64[2] ? src[0].i64[2] : src[1].i64[2];
    814    dst->i64[3] = src[0].i64[3] > src[1].i64[3] ? src[0].i64[3] : src[1].i64[3];
    815 }
    816 
    817 static void
    818 micro_u64min(union tgsi_double_channel *dst,
    819              const union tgsi_double_channel *src)
    820 {
    821    dst->u64[0] = src[0].u64[0] < src[1].u64[0] ? src[0].u64[0] : src[1].u64[0];
    822    dst->u64[1] = src[0].u64[1] < src[1].u64[1] ? src[0].u64[1] : src[1].u64[1];
    823    dst->u64[2] = src[0].u64[2] < src[1].u64[2] ? src[0].u64[2] : src[1].u64[2];
    824    dst->u64[3] = src[0].u64[3] < src[1].u64[3] ? src[0].u64[3] : src[1].u64[3];
    825 }
    826 
    827 static void
    828 micro_i64min(union tgsi_double_channel *dst,
    829              const union tgsi_double_channel *src)
    830 {
    831    dst->i64[0] = src[0].i64[0] < src[1].i64[0] ? src[0].i64[0] : src[1].i64[0];
    832    dst->i64[1] = src[0].i64[1] < src[1].i64[1] ? src[0].i64[1] : src[1].i64[1];
    833    dst->i64[2] = src[0].i64[2] < src[1].i64[2] ? src[0].i64[2] : src[1].i64[2];
    834    dst->i64[3] = src[0].i64[3] < src[1].i64[3] ? src[0].i64[3] : src[1].i64[3];
    835 }
    836 
    837 static void
    838 micro_u64add(union tgsi_double_channel *dst,
    839              const union tgsi_double_channel *src)
    840 {
    841    dst->u64[0] = src[0].u64[0] + src[1].u64[0];
    842    dst->u64[1] = src[0].u64[1] + src[1].u64[1];
    843    dst->u64[2] = src[0].u64[2] + src[1].u64[2];
    844    dst->u64[3] = src[0].u64[3] + src[1].u64[3];
    845 }
    846 
    847 static void
    848 micro_u64mul(union tgsi_double_channel *dst,
    849              const union tgsi_double_channel *src)
    850 {
    851    dst->u64[0] = src[0].u64[0] * src[1].u64[0];
    852    dst->u64[1] = src[0].u64[1] * src[1].u64[1];
    853    dst->u64[2] = src[0].u64[2] * src[1].u64[2];
    854    dst->u64[3] = src[0].u64[3] * src[1].u64[3];
    855 }
    856 
    857 static void
    858 micro_u64div(union tgsi_double_channel *dst,
    859              const union tgsi_double_channel *src)
    860 {
    861    dst->u64[0] = src[0].u64[0] / src[1].u64[0];
    862    dst->u64[1] = src[0].u64[1] / src[1].u64[1];
    863    dst->u64[2] = src[0].u64[2] / src[1].u64[2];
    864    dst->u64[3] = src[0].u64[3] / src[1].u64[3];
    865 }
    866 
    867 static void
    868 micro_i64div(union tgsi_double_channel *dst,
    869              const union tgsi_double_channel *src)
    870 {
    871    dst->i64[0] = src[0].i64[0] / src[1].i64[0];
    872    dst->i64[1] = src[0].i64[1] / src[1].i64[1];
    873    dst->i64[2] = src[0].i64[2] / src[1].i64[2];
    874    dst->i64[3] = src[0].i64[3] / src[1].i64[3];
    875 }
    876 
    877 static void
    878 micro_u64mod(union tgsi_double_channel *dst,
    879              const union tgsi_double_channel *src)
    880 {
    881    dst->u64[0] = src[0].u64[0] % src[1].u64[0];
    882    dst->u64[1] = src[0].u64[1] % src[1].u64[1];
    883    dst->u64[2] = src[0].u64[2] % src[1].u64[2];
    884    dst->u64[3] = src[0].u64[3] % src[1].u64[3];
    885 }
    886 
    887 static void
    888 micro_i64mod(union tgsi_double_channel *dst,
    889              const union tgsi_double_channel *src)
    890 {
    891    dst->i64[0] = src[0].i64[0] % src[1].i64[0];
    892    dst->i64[1] = src[0].i64[1] % src[1].i64[1];
    893    dst->i64[2] = src[0].i64[2] % src[1].i64[2];
    894    dst->i64[3] = src[0].i64[3] % src[1].i64[3];
    895 }
    896 
    897 static void
    898 micro_u64shl(union tgsi_double_channel *dst,
    899              const union tgsi_double_channel *src0,
    900              union tgsi_exec_channel *src1)
    901 {
    902    unsigned masked_count;
    903    masked_count = src1->u[0] & 0x3f;
    904    dst->u64[0] = src0->u64[0] << masked_count;
    905    masked_count = src1->u[1] & 0x3f;
    906    dst->u64[1] = src0->u64[1] << masked_count;
    907    masked_count = src1->u[2] & 0x3f;
    908    dst->u64[2] = src0->u64[2] << masked_count;
    909    masked_count = src1->u[3] & 0x3f;
    910    dst->u64[3] = src0->u64[3] << masked_count;
    911 }
    912 
    913 static void
    914 micro_i64shr(union tgsi_double_channel *dst,
    915              const union tgsi_double_channel *src0,
    916              union tgsi_exec_channel *src1)
    917 {
    918    unsigned masked_count;
    919    masked_count = src1->u[0] & 0x3f;
    920    dst->i64[0] = src0->i64[0] >> masked_count;
    921    masked_count = src1->u[1] & 0x3f;
    922    dst->i64[1] = src0->i64[1] >> masked_count;
    923    masked_count = src1->u[2] & 0x3f;
    924    dst->i64[2] = src0->i64[2] >> masked_count;
    925    masked_count = src1->u[3] & 0x3f;
    926    dst->i64[3] = src0->i64[3] >> masked_count;
    927 }
    928 
    929 static void
    930 micro_u64shr(union tgsi_double_channel *dst,
    931              const union tgsi_double_channel *src0,
    932              union tgsi_exec_channel *src1)
    933 {
    934    unsigned masked_count;
    935    masked_count = src1->u[0] & 0x3f;
    936    dst->u64[0] = src0->u64[0] >> masked_count;
    937    masked_count = src1->u[1] & 0x3f;
    938    dst->u64[1] = src0->u64[1] >> masked_count;
    939    masked_count = src1->u[2] & 0x3f;
    940    dst->u64[2] = src0->u64[2] >> masked_count;
    941    masked_count = src1->u[3] & 0x3f;
    942    dst->u64[3] = src0->u64[3] >> masked_count;
    943 }
    944 
    945 enum tgsi_exec_datatype {
    946    TGSI_EXEC_DATA_FLOAT,
    947    TGSI_EXEC_DATA_INT,
    948    TGSI_EXEC_DATA_UINT,
    949    TGSI_EXEC_DATA_DOUBLE,
    950    TGSI_EXEC_DATA_INT64,
    951    TGSI_EXEC_DATA_UINT64,
    952 };
    953 
/*
 * Shorthand locations of various utility registers (_I = Index, _C = Channel).
 *
 * Each name below is a plain alias for the TGSI_EXEC_TEMP_* macro of the
 * same suffix; the _I/_C pair together identifies one channel of one
 * register.
 */
#define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
#define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
#define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
#define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
#define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
#define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
    963 
    964 
    965 /** The execution mask depends on the conditional mask and the loop mask */
    966 #define UPDATE_EXEC_MASK(MACH) \
    967       MACH->ExecMask = MACH->CondMask & MACH->LoopMask & MACH->ContMask & MACH->Switch.mask & MACH->FuncMask
    968 
    969 
    970 static const union tgsi_exec_channel ZeroVec =
    971    { { 0.0, 0.0, 0.0, 0.0 } };
    972 
    973 static const union tgsi_exec_channel OneVec = {
    974    {1.0f, 1.0f, 1.0f, 1.0f}
    975 };
    976 
    977 static const union tgsi_exec_channel P128Vec = {
    978    {128.0f, 128.0f, 128.0f, 128.0f}
    979 };
    980 
    981 static const union tgsi_exec_channel M128Vec = {
    982    {-128.0f, -128.0f, -128.0f, -128.0f}
    983 };
    984 
    985 
    986 /**
    987  * Assert that none of the float values in 'chan' are infinite or NaN.
    988  * NaN and Inf may occur normally during program execution and should
    989  * not lead to crashes, etc.  But when debugging, it's helpful to catch
    990  * them.
    991  */
    992 static inline void
    993 check_inf_or_nan(const union tgsi_exec_channel *chan)
    994 {
    995    assert(!util_is_inf_or_nan((chan)->f[0]));
    996    assert(!util_is_inf_or_nan((chan)->f[1]));
    997    assert(!util_is_inf_or_nan((chan)->f[2]));
    998    assert(!util_is_inf_or_nan((chan)->f[3]));
    999 }
   1000 
   1001 
   1002 #ifdef DEBUG
   1003 static void
   1004 print_chan(const char *msg, const union tgsi_exec_channel *chan)
   1005 {
   1006    debug_printf("%s = {%f, %f, %f, %f}\n",
   1007                 msg, chan->f[0], chan->f[1], chan->f[2], chan->f[3]);
   1008 }
   1009 #endif
   1010 
   1011 
   1012 #ifdef DEBUG
   1013 static void
   1014 print_temp(const struct tgsi_exec_machine *mach, uint index)
   1015 {
   1016    const struct tgsi_exec_vector *tmp = &mach->Temps[index];
   1017    int i;
   1018    debug_printf("Temp[%u] =\n", index);
   1019    for (i = 0; i < 4; i++) {
   1020       debug_printf("  %c: { %f, %f, %f, %f }\n",
   1021                    "XYZW"[i],
   1022                    tmp->xyzw[i].f[0],
   1023                    tmp->xyzw[i].f[1],
   1024                    tmp->xyzw[i].f[2],
   1025                    tmp->xyzw[i].f[3]);
   1026    }
   1027 }
   1028 #endif
   1029 
   1030 
   1031 void
   1032 tgsi_exec_set_constant_buffers(struct tgsi_exec_machine *mach,
   1033                                unsigned num_bufs,
   1034                                const void **bufs,
   1035                                const unsigned *buf_sizes)
   1036 {
   1037    unsigned i;
   1038 
   1039    for (i = 0; i < num_bufs; i++) {
   1040       mach->Consts[i] = bufs[i];
   1041       mach->ConstsSize[i] = buf_sizes[i];
   1042    }
   1043 }
   1044 
   1045 
   1046 /**
   1047  * Check if there's a potential src/dst register data dependency when
   1048  * using SOA execution.
   1049  * Example:
   1050  *   MOV T, T.yxwz;
   1051  * This would expand into:
   1052  *   MOV t0, t1;
   1053  *   MOV t1, t0;
   1054  *   MOV t2, t3;
   1055  *   MOV t3, t2;
   1056  * The second instruction will have the wrong value for t0 if executed as-is.
   1057  */
   1058 boolean
   1059 tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst)
   1060 {
   1061    uint i, chan;
   1062 
   1063    uint writemask = inst->Dst[0].Register.WriteMask;
   1064    if (writemask == TGSI_WRITEMASK_X ||
   1065        writemask == TGSI_WRITEMASK_Y ||
   1066        writemask == TGSI_WRITEMASK_Z ||
   1067        writemask == TGSI_WRITEMASK_W ||
   1068        writemask == TGSI_WRITEMASK_NONE) {
   1069       /* no chance of data dependency */
   1070       return FALSE;
   1071    }
   1072 
   1073    /* loop over src regs */
   1074    for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
   1075       if ((inst->Src[i].Register.File ==
   1076            inst->Dst[0].Register.File) &&
   1077           ((inst->Src[i].Register.Index ==
   1078             inst->Dst[0].Register.Index) ||
   1079            inst->Src[i].Register.Indirect ||
   1080            inst->Dst[0].Register.Indirect)) {
   1081          /* loop over dest channels */
   1082          uint channelsWritten = 0x0;
   1083          for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   1084             if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   1085                /* check if we're reading a channel that's been written */
   1086                uint swizzle = tgsi_util_get_full_src_register_swizzle(&inst->Src[i], chan);
   1087                if (channelsWritten & (1 << swizzle)) {
   1088                   return TRUE;
   1089                }
   1090 
   1091                channelsWritten |= (1 << chan);
   1092             }
   1093          }
   1094       }
   1095    }
   1096    return FALSE;
   1097 }
   1098 
   1099 
   1100 /**
   1101  * Initialize machine state by expanding tokens to full instructions,
   1102  * allocating temporary storage, setting up constants, etc.
   1103  * After this, we can call tgsi_exec_machine_run() many times.
   1104  */
   1105 void
   1106 tgsi_exec_machine_bind_shader(
   1107    struct tgsi_exec_machine *mach,
   1108    const struct tgsi_token *tokens,
   1109    struct tgsi_sampler *sampler,
   1110    struct tgsi_image *image,
   1111    struct tgsi_buffer *buffer)
   1112 {
   1113    uint k;
   1114    struct tgsi_parse_context parse;
   1115    struct tgsi_full_instruction *instructions;
   1116    struct tgsi_full_declaration *declarations;
   1117    uint maxInstructions = 10, numInstructions = 0;
   1118    uint maxDeclarations = 10, numDeclarations = 0;
   1119 
   1120 #if 0
   1121    tgsi_dump(tokens, 0);
   1122 #endif
   1123 
   1124    util_init_math();
   1125 
   1126 
   1127    mach->Tokens = tokens;
   1128    mach->Sampler = sampler;
   1129    mach->Image = image;
   1130    mach->Buffer = buffer;
   1131 
   1132    if (!tokens) {
   1133       /* unbind and free all */
   1134       FREE(mach->Declarations);
   1135       mach->Declarations = NULL;
   1136       mach->NumDeclarations = 0;
   1137 
   1138       FREE(mach->Instructions);
   1139       mach->Instructions = NULL;
   1140       mach->NumInstructions = 0;
   1141 
   1142       return;
   1143    }
   1144 
   1145    k = tgsi_parse_init (&parse, mach->Tokens);
   1146    if (k != TGSI_PARSE_OK) {
   1147       debug_printf( "Problem parsing!\n" );
   1148       return;
   1149    }
   1150 
   1151    mach->ImmLimit = 0;
   1152    mach->NumOutputs = 0;
   1153 
   1154    for (k = 0; k < TGSI_SEMANTIC_COUNT; k++)
   1155       mach->SysSemanticToIndex[k] = -1;
   1156 
   1157    if (mach->ShaderType == PIPE_SHADER_GEOMETRY &&
   1158        !mach->UsedGeometryShader) {
   1159       struct tgsi_exec_vector *inputs;
   1160       struct tgsi_exec_vector *outputs;
   1161 
   1162       inputs = align_malloc(sizeof(struct tgsi_exec_vector) *
   1163                             TGSI_MAX_PRIM_VERTICES * PIPE_MAX_SHADER_INPUTS,
   1164                             16);
   1165 
   1166       if (!inputs)
   1167          return;
   1168 
   1169       outputs = align_malloc(sizeof(struct tgsi_exec_vector) *
   1170                              TGSI_MAX_TOTAL_VERTICES, 16);
   1171 
   1172       if (!outputs) {
   1173          align_free(inputs);
   1174          return;
   1175       }
   1176 
   1177       align_free(mach->Inputs);
   1178       align_free(mach->Outputs);
   1179 
   1180       mach->Inputs = inputs;
   1181       mach->Outputs = outputs;
   1182       mach->UsedGeometryShader = TRUE;
   1183    }
   1184 
   1185    declarations = (struct tgsi_full_declaration *)
   1186       MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
   1187 
   1188    if (!declarations) {
   1189       return;
   1190    }
   1191 
   1192    instructions = (struct tgsi_full_instruction *)
   1193       MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
   1194 
   1195    if (!instructions) {
   1196       FREE( declarations );
   1197       return;
   1198    }
   1199 
   1200    while( !tgsi_parse_end_of_tokens( &parse ) ) {
   1201       uint i;
   1202 
   1203       tgsi_parse_token( &parse );
   1204       switch( parse.FullToken.Token.Type ) {
   1205       case TGSI_TOKEN_TYPE_DECLARATION:
   1206          /* save expanded declaration */
   1207          if (numDeclarations == maxDeclarations) {
   1208             declarations = REALLOC(declarations,
   1209                                    maxDeclarations
   1210                                    * sizeof(struct tgsi_full_declaration),
   1211                                    (maxDeclarations + 10)
   1212                                    * sizeof(struct tgsi_full_declaration));
   1213             maxDeclarations += 10;
   1214          }
   1215          if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_OUTPUT) {
   1216             unsigned reg;
   1217             for (reg = parse.FullToken.FullDeclaration.Range.First;
   1218                  reg <= parse.FullToken.FullDeclaration.Range.Last;
   1219                  ++reg) {
   1220                ++mach->NumOutputs;
   1221             }
   1222          }
   1223          else if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
   1224             const struct tgsi_full_declaration *decl = &parse.FullToken.FullDeclaration;
   1225             mach->SysSemanticToIndex[decl->Semantic.Name] = decl->Range.First;
   1226          }
   1227 
   1228          memcpy(declarations + numDeclarations,
   1229                 &parse.FullToken.FullDeclaration,
   1230                 sizeof(declarations[0]));
   1231          numDeclarations++;
   1232          break;
   1233 
   1234       case TGSI_TOKEN_TYPE_IMMEDIATE:
   1235          {
   1236             uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
   1237             assert( size <= 4 );
   1238             assert( mach->ImmLimit + 1 <= TGSI_EXEC_NUM_IMMEDIATES );
   1239 
   1240             for( i = 0; i < size; i++ ) {
   1241                mach->Imms[mach->ImmLimit][i] =
   1242 		  parse.FullToken.FullImmediate.u[i].Float;
   1243             }
   1244             mach->ImmLimit += 1;
   1245          }
   1246          break;
   1247 
   1248       case TGSI_TOKEN_TYPE_INSTRUCTION:
   1249 
   1250          /* save expanded instruction */
   1251          if (numInstructions == maxInstructions) {
   1252             instructions = REALLOC(instructions,
   1253                                    maxInstructions
   1254                                    * sizeof(struct tgsi_full_instruction),
   1255                                    (maxInstructions + 10)
   1256                                    * sizeof(struct tgsi_full_instruction));
   1257             maxInstructions += 10;
   1258          }
   1259 
   1260          memcpy(instructions + numInstructions,
   1261                 &parse.FullToken.FullInstruction,
   1262                 sizeof(instructions[0]));
   1263 
   1264          numInstructions++;
   1265          break;
   1266 
   1267       case TGSI_TOKEN_TYPE_PROPERTY:
   1268          if (mach->ShaderType == PIPE_SHADER_GEOMETRY) {
   1269             if (parse.FullToken.FullProperty.Property.PropertyName == TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES) {
   1270                mach->MaxOutputVertices = parse.FullToken.FullProperty.u[0].Data;
   1271             }
   1272          }
   1273          break;
   1274 
   1275       default:
   1276          assert( 0 );
   1277       }
   1278    }
   1279    tgsi_parse_free (&parse);
   1280 
   1281    FREE(mach->Declarations);
   1282    mach->Declarations = declarations;
   1283    mach->NumDeclarations = numDeclarations;
   1284 
   1285    FREE(mach->Instructions);
   1286    mach->Instructions = instructions;
   1287    mach->NumInstructions = numInstructions;
   1288 }
   1289 
   1290 
   1291 struct tgsi_exec_machine *
   1292 tgsi_exec_machine_create(enum pipe_shader_type shader_type)
   1293 {
   1294    struct tgsi_exec_machine *mach;
   1295    uint i;
   1296 
   1297    mach = align_malloc( sizeof *mach, 16 );
   1298    if (!mach)
   1299       goto fail;
   1300 
   1301    memset(mach, 0, sizeof(*mach));
   1302 
   1303    mach->ShaderType = shader_type;
   1304    mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
   1305    mach->MaxGeometryShaderOutputs = TGSI_MAX_TOTAL_VERTICES;
   1306    mach->Predicates = &mach->Temps[TGSI_EXEC_TEMP_P0];
   1307 
   1308    if (shader_type != PIPE_SHADER_COMPUTE) {
   1309       mach->Inputs = align_malloc(sizeof(struct tgsi_exec_vector) * PIPE_MAX_SHADER_INPUTS, 16);
   1310       mach->Outputs = align_malloc(sizeof(struct tgsi_exec_vector) * PIPE_MAX_SHADER_OUTPUTS, 16);
   1311       if (!mach->Inputs || !mach->Outputs)
   1312          goto fail;
   1313    }
   1314 
   1315    /* Setup constants needed by the SSE2 executor. */
   1316    for( i = 0; i < 4; i++ ) {
   1317       mach->Temps[TGSI_EXEC_TEMP_00000000_I].xyzw[TGSI_EXEC_TEMP_00000000_C].u[i] = 0x00000000;
   1318       mach->Temps[TGSI_EXEC_TEMP_7FFFFFFF_I].xyzw[TGSI_EXEC_TEMP_7FFFFFFF_C].u[i] = 0x7FFFFFFF;
   1319       mach->Temps[TGSI_EXEC_TEMP_80000000_I].xyzw[TGSI_EXEC_TEMP_80000000_C].u[i] = 0x80000000;
   1320       mach->Temps[TGSI_EXEC_TEMP_FFFFFFFF_I].xyzw[TGSI_EXEC_TEMP_FFFFFFFF_C].u[i] = 0xFFFFFFFF;    /* not used */
   1321       mach->Temps[TGSI_EXEC_TEMP_ONE_I].xyzw[TGSI_EXEC_TEMP_ONE_C].f[i] = 1.0f;
   1322       mach->Temps[TGSI_EXEC_TEMP_TWO_I].xyzw[TGSI_EXEC_TEMP_TWO_C].f[i] = 2.0f;    /* not used */
   1323       mach->Temps[TGSI_EXEC_TEMP_128_I].xyzw[TGSI_EXEC_TEMP_128_C].f[i] = 128.0f;
   1324       mach->Temps[TGSI_EXEC_TEMP_MINUS_128_I].xyzw[TGSI_EXEC_TEMP_MINUS_128_C].f[i] = -128.0f;
   1325       mach->Temps[TGSI_EXEC_TEMP_THREE_I].xyzw[TGSI_EXEC_TEMP_THREE_C].f[i] = 3.0f;
   1326       mach->Temps[TGSI_EXEC_TEMP_HALF_I].xyzw[TGSI_EXEC_TEMP_HALF_C].f[i] = 0.5f;
   1327    }
   1328 
   1329 #ifdef DEBUG
   1330    /* silence warnings */
   1331    (void) print_chan;
   1332    (void) print_temp;
   1333 #endif
   1334 
   1335    return mach;
   1336 
   1337 fail:
   1338    if (mach) {
   1339       align_free(mach->Inputs);
   1340       align_free(mach->Outputs);
   1341       align_free(mach);
   1342    }
   1343    return NULL;
   1344 }
   1345 
   1346 
   1347 void
   1348 tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach)
   1349 {
   1350    if (mach) {
   1351       FREE(mach->Instructions);
   1352       FREE(mach->Declarations);
   1353 
   1354       align_free(mach->Inputs);
   1355       align_free(mach->Outputs);
   1356 
   1357       align_free(mach);
   1358    }
   1359 }
   1360 
   1361 static void
   1362 micro_add(union tgsi_exec_channel *dst,
   1363           const union tgsi_exec_channel *src0,
   1364           const union tgsi_exec_channel *src1)
   1365 {
   1366    dst->f[0] = src0->f[0] + src1->f[0];
   1367    dst->f[1] = src0->f[1] + src1->f[1];
   1368    dst->f[2] = src0->f[2] + src1->f[2];
   1369    dst->f[3] = src0->f[3] + src1->f[3];
   1370 }
   1371 
   1372 static void
   1373 micro_div(
   1374    union tgsi_exec_channel *dst,
   1375    const union tgsi_exec_channel *src0,
   1376    const union tgsi_exec_channel *src1 )
   1377 {
   1378    if (src1->f[0] != 0) {
   1379       dst->f[0] = src0->f[0] / src1->f[0];
   1380    }
   1381    if (src1->f[1] != 0) {
   1382       dst->f[1] = src0->f[1] / src1->f[1];
   1383    }
   1384    if (src1->f[2] != 0) {
   1385       dst->f[2] = src0->f[2] / src1->f[2];
   1386    }
   1387    if (src1->f[3] != 0) {
   1388       dst->f[3] = src0->f[3] / src1->f[3];
   1389    }
   1390 }
   1391 
   1392 static void
   1393 micro_lt(
   1394    union tgsi_exec_channel *dst,
   1395    const union tgsi_exec_channel *src0,
   1396    const union tgsi_exec_channel *src1,
   1397    const union tgsi_exec_channel *src2,
   1398    const union tgsi_exec_channel *src3 )
   1399 {
   1400    dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
   1401    dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
   1402    dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
   1403    dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
   1404 }
   1405 
   1406 static void
   1407 micro_max(union tgsi_exec_channel *dst,
   1408           const union tgsi_exec_channel *src0,
   1409           const union tgsi_exec_channel *src1)
   1410 {
   1411    dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
   1412    dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
   1413    dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
   1414    dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
   1415 }
   1416 
   1417 static void
   1418 micro_min(union tgsi_exec_channel *dst,
   1419           const union tgsi_exec_channel *src0,
   1420           const union tgsi_exec_channel *src1)
   1421 {
   1422    dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
   1423    dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
   1424    dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
   1425    dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
   1426 }
   1427 
   1428 static void
   1429 micro_mul(union tgsi_exec_channel *dst,
   1430           const union tgsi_exec_channel *src0,
   1431           const union tgsi_exec_channel *src1)
   1432 {
   1433    dst->f[0] = src0->f[0] * src1->f[0];
   1434    dst->f[1] = src0->f[1] * src1->f[1];
   1435    dst->f[2] = src0->f[2] * src1->f[2];
   1436    dst->f[3] = src0->f[3] * src1->f[3];
   1437 }
   1438 
   1439 static void
   1440 micro_neg(
   1441    union tgsi_exec_channel *dst,
   1442    const union tgsi_exec_channel *src )
   1443 {
   1444    dst->f[0] = -src->f[0];
   1445    dst->f[1] = -src->f[1];
   1446    dst->f[2] = -src->f[2];
   1447    dst->f[3] = -src->f[3];
   1448 }
   1449 
   1450 static void
   1451 micro_pow(
   1452    union tgsi_exec_channel *dst,
   1453    const union tgsi_exec_channel *src0,
   1454    const union tgsi_exec_channel *src1 )
   1455 {
   1456 #if FAST_MATH
   1457    dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] );
   1458    dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] );
   1459    dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] );
   1460    dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] );
   1461 #else
   1462    dst->f[0] = powf( src0->f[0], src1->f[0] );
   1463    dst->f[1] = powf( src0->f[1], src1->f[1] );
   1464    dst->f[2] = powf( src0->f[2], src1->f[2] );
   1465    dst->f[3] = powf( src0->f[3], src1->f[3] );
   1466 #endif
   1467 }
   1468 
   1469 static void
   1470 micro_sub(union tgsi_exec_channel *dst,
   1471           const union tgsi_exec_channel *src0,
   1472           const union tgsi_exec_channel *src1)
   1473 {
   1474    dst->f[0] = src0->f[0] - src1->f[0];
   1475    dst->f[1] = src0->f[1] - src1->f[1];
   1476    dst->f[2] = src0->f[2] - src1->f[2];
   1477    dst->f[3] = src0->f[3] - src1->f[3];
   1478 }
   1479 
   1480 static void
   1481 fetch_src_file_channel(const struct tgsi_exec_machine *mach,
   1482                        const uint chan_index,
   1483                        const uint file,
   1484                        const uint swizzle,
   1485                        const union tgsi_exec_channel *index,
   1486                        const union tgsi_exec_channel *index2D,
   1487                        union tgsi_exec_channel *chan)
   1488 {
   1489    uint i;
   1490 
   1491    assert(swizzle < 4);
   1492 
   1493    switch (file) {
   1494    case TGSI_FILE_CONSTANT:
   1495       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   1496          assert(index2D->i[i] >= 0 && index2D->i[i] < PIPE_MAX_CONSTANT_BUFFERS);
   1497          assert(mach->Consts[index2D->i[i]]);
   1498 
   1499          if (index->i[i] < 0) {
   1500             chan->u[i] = 0;
   1501          } else {
   1502             /* NOTE: copying the const value as a uint instead of float */
   1503             const uint constbuf = index2D->i[i];
   1504             const uint *buf = (const uint *)mach->Consts[constbuf];
   1505             const int pos = index->i[i] * 4 + swizzle;
   1506             /* const buffer bounds check */
   1507             if (pos < 0 || pos >= (int) mach->ConstsSize[constbuf]) {
   1508                if (0) {
   1509                   /* Debug: print warning */
   1510                   static int count = 0;
   1511                   if (count++ < 100)
   1512                      debug_printf("TGSI Exec: const buffer index %d"
   1513                                   " out of bounds\n", pos);
   1514                }
   1515                chan->u[i] = 0;
   1516             }
   1517             else
   1518                chan->u[i] = buf[pos];
   1519          }
   1520       }
   1521       break;
   1522 
   1523    case TGSI_FILE_INPUT:
   1524       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   1525          /*
   1526          if (PIPE_SHADER_GEOMETRY == mach->ShaderType) {
   1527             debug_printf("Fetching Input[%d] (2d=%d, 1d=%d)\n",
   1528                          index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i],
   1529                          index2D->i[i], index->i[i]);
   1530                          }*/
   1531          int pos = index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i];
   1532          assert(pos >= 0);
   1533          assert(pos < TGSI_MAX_PRIM_VERTICES * PIPE_MAX_ATTRIBS);
   1534          chan->u[i] = mach->Inputs[pos].xyzw[swizzle].u[i];
   1535       }
   1536       break;
   1537 
   1538    case TGSI_FILE_SYSTEM_VALUE:
   1539       /* XXX no swizzling at this point.  Will be needed if we put
   1540        * gl_FragCoord, for example, in a sys value register.
   1541        */
   1542       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   1543          chan->u[i] = mach->SystemValue[index->i[i]].xyzw[swizzle].u[i];
   1544       }
   1545       break;
   1546 
   1547    case TGSI_FILE_TEMPORARY:
   1548       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   1549          assert(index->i[i] < TGSI_EXEC_NUM_TEMPS);
   1550          assert(index2D->i[i] == 0);
   1551 
   1552          chan->u[i] = mach->Temps[index->i[i]].xyzw[swizzle].u[i];
   1553       }
   1554       break;
   1555 
   1556    case TGSI_FILE_IMMEDIATE:
   1557       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   1558          assert(index->i[i] >= 0 && index->i[i] < (int)mach->ImmLimit);
   1559          assert(index2D->i[i] == 0);
   1560 
   1561          chan->f[i] = mach->Imms[index->i[i]][swizzle];
   1562       }
   1563       break;
   1564 
   1565    case TGSI_FILE_ADDRESS:
   1566       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   1567          assert(index->i[i] >= 0);
   1568          assert(index2D->i[i] == 0);
   1569 
   1570          chan->u[i] = mach->Addrs[index->i[i]].xyzw[swizzle].u[i];
   1571       }
   1572       break;
   1573 
   1574    case TGSI_FILE_PREDICATE:
   1575       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   1576          assert(index->i[i] >= 0 && index->i[i] < TGSI_EXEC_NUM_PREDS);
   1577          assert(index2D->i[i] == 0);
   1578 
   1579          chan->u[i] = mach->Predicates[0].xyzw[swizzle].u[i];
   1580       }
   1581       break;
   1582 
   1583    case TGSI_FILE_OUTPUT:
   1584       /* vertex/fragment output vars can be read too */
   1585       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   1586          assert(index->i[i] >= 0);
   1587          assert(index2D->i[i] == 0);
   1588 
   1589          chan->u[i] = mach->Outputs[index->i[i]].xyzw[swizzle].u[i];
   1590       }
   1591       break;
   1592 
   1593    default:
   1594       assert(0);
   1595       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   1596          chan->u[i] = 0;
   1597       }
   1598    }
   1599 }
   1600 
   1601 static void
   1602 fetch_source_d(const struct tgsi_exec_machine *mach,
   1603                union tgsi_exec_channel *chan,
   1604                const struct tgsi_full_src_register *reg,
   1605                const uint chan_index,
   1606                enum tgsi_exec_datatype src_datatype)
   1607 {
   1608    union tgsi_exec_channel index;
   1609    union tgsi_exec_channel index2D;
   1610    uint swizzle;
   1611 
   1612    /* We start with a direct index into a register file.
   1613     *
   1614     *    file[1],
   1615     *    where:
   1616     *       file = Register.File
   1617     *       [1] = Register.Index
   1618     */
   1619    index.i[0] =
   1620    index.i[1] =
   1621    index.i[2] =
   1622    index.i[3] = reg->Register.Index;
   1623 
   1624    /* There is an extra source register that indirectly subscripts
   1625     * a register file. The direct index now becomes an offset
   1626     * that is being added to the indirect register.
   1627     *
   1628     *    file[ind[2].x+1],
   1629     *    where:
   1630     *       ind = Indirect.File
   1631     *       [2] = Indirect.Index
   1632     *       .x = Indirect.SwizzleX
   1633     */
   1634    if (reg->Register.Indirect) {
   1635       union tgsi_exec_channel index2;
   1636       union tgsi_exec_channel indir_index;
   1637       const uint execmask = mach->ExecMask;
   1638       uint i;
   1639 
   1640       /* which address register (always zero now) */
   1641       index2.i[0] =
   1642       index2.i[1] =
   1643       index2.i[2] =
   1644       index2.i[3] = reg->Indirect.Index;
   1645       /* get current value of address register[swizzle] */
   1646       swizzle = reg->Indirect.Swizzle;
   1647       fetch_src_file_channel(mach,
   1648                              chan_index,
   1649                              reg->Indirect.File,
   1650                              swizzle,
   1651                              &index2,
   1652                              &ZeroVec,
   1653                              &indir_index);
   1654 
   1655       /* add value of address register to the offset */
   1656       index.i[0] += indir_index.i[0];
   1657       index.i[1] += indir_index.i[1];
   1658       index.i[2] += indir_index.i[2];
   1659       index.i[3] += indir_index.i[3];
   1660 
   1661       /* for disabled execution channels, zero-out the index to
   1662        * avoid using a potential garbage value.
   1663        */
   1664       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   1665          if ((execmask & (1 << i)) == 0)
   1666             index.i[i] = 0;
   1667       }
   1668    }
   1669 
   1670    /* There is an extra source register that is a second
   1671     * subscript to a register file. Effectively it means that
   1672     * the register file is actually a 2D array of registers.
   1673     *
   1674     *    file[3][1],
   1675     *    where:
   1676     *       [3] = Dimension.Index
   1677     */
   1678    if (reg->Register.Dimension) {
   1679       index2D.i[0] =
   1680       index2D.i[1] =
   1681       index2D.i[2] =
   1682       index2D.i[3] = reg->Dimension.Index;
   1683 
   1684       /* Again, the second subscript index can be addressed indirectly
   1685        * identically to the first one.
   1686        * Nothing stops us from indirectly addressing the indirect register,
   1687        * but there is no need for that, so we won't exercise it.
   1688        *
   1689        *    file[ind[4].y+3][1],
   1690        *    where:
   1691        *       ind = DimIndirect.File
   1692        *       [4] = DimIndirect.Index
   1693        *       .y = DimIndirect.SwizzleX
   1694        */
   1695       if (reg->Dimension.Indirect) {
   1696          union tgsi_exec_channel index2;
   1697          union tgsi_exec_channel indir_index;
   1698          const uint execmask = mach->ExecMask;
   1699          uint i;
   1700 
   1701          index2.i[0] =
   1702          index2.i[1] =
   1703          index2.i[2] =
   1704          index2.i[3] = reg->DimIndirect.Index;
   1705 
   1706          swizzle = reg->DimIndirect.Swizzle;
   1707          fetch_src_file_channel(mach,
   1708                                 chan_index,
   1709                                 reg->DimIndirect.File,
   1710                                 swizzle,
   1711                                 &index2,
   1712                                 &ZeroVec,
   1713                                 &indir_index);
   1714 
   1715          index2D.i[0] += indir_index.i[0];
   1716          index2D.i[1] += indir_index.i[1];
   1717          index2D.i[2] += indir_index.i[2];
   1718          index2D.i[3] += indir_index.i[3];
   1719 
   1720          /* for disabled execution channels, zero-out the index to
   1721           * avoid using a potential garbage value.
   1722           */
   1723          for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   1724             if ((execmask & (1 << i)) == 0) {
   1725                index2D.i[i] = 0;
   1726             }
   1727          }
   1728       }
   1729 
   1730       /* If by any chance there was a need for a 3D array of register
   1731        * files, we would have to check whether Dimension is followed
   1732        * by a dimension register and continue the saga.
   1733        */
   1734    } else {
   1735       index2D.i[0] =
   1736       index2D.i[1] =
   1737       index2D.i[2] =
   1738       index2D.i[3] = 0;
   1739    }
   1740 
   1741    swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
   1742    fetch_src_file_channel(mach,
   1743                           chan_index,
   1744                           reg->Register.File,
   1745                           swizzle,
   1746                           &index,
   1747                           &index2D,
   1748                           chan);
   1749 }
   1750 
   1751 static void
   1752 fetch_source(const struct tgsi_exec_machine *mach,
   1753              union tgsi_exec_channel *chan,
   1754              const struct tgsi_full_src_register *reg,
   1755              const uint chan_index,
   1756              enum tgsi_exec_datatype src_datatype)
   1757 {
   1758    fetch_source_d(mach, chan, reg, chan_index, src_datatype);
   1759 
   1760    if (reg->Register.Absolute) {
   1761       if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
   1762          micro_abs(chan, chan);
   1763       } else {
   1764          micro_iabs(chan, chan);
   1765       }
   1766    }
   1767 
   1768    if (reg->Register.Negate) {
   1769       if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
   1770          micro_neg(chan, chan);
   1771       } else {
   1772          micro_ineg(chan, chan);
   1773       }
   1774    }
   1775 }
   1776 
   1777 static union tgsi_exec_channel *
   1778 store_dest_dstret(struct tgsi_exec_machine *mach,
   1779                  const union tgsi_exec_channel *chan,
   1780                  const struct tgsi_full_dst_register *reg,
   1781                  const struct tgsi_full_instruction *inst,
   1782                  uint chan_index,
   1783                  enum tgsi_exec_datatype dst_datatype)
   1784 {
   1785    uint i;
   1786    static union tgsi_exec_channel null;
   1787    union tgsi_exec_channel *dst;
   1788    union tgsi_exec_channel index2D;
   1789    uint execmask = mach->ExecMask;
   1790    int offset = 0;  /* indirection offset */
   1791    int index;
   1792 
   1793    /* for debugging */
   1794    if (0 && dst_datatype == TGSI_EXEC_DATA_FLOAT) {
   1795       check_inf_or_nan(chan);
   1796    }
   1797 
   1798    /* There is an extra source register that indirectly subscripts
   1799     * a register file. The direct index now becomes an offset
   1800     * that is being added to the indirect register.
   1801     *
   1802     *    file[ind[2].x+1],
   1803     *    where:
   1804     *       ind = Indirect.File
   1805     *       [2] = Indirect.Index
   1806     *       .x = Indirect.SwizzleX
   1807     */
   1808    if (reg->Register.Indirect) {
   1809       union tgsi_exec_channel index;
   1810       union tgsi_exec_channel indir_index;
   1811       uint swizzle;
   1812 
   1813       /* which address register (always zero for now) */
   1814       index.i[0] =
   1815       index.i[1] =
   1816       index.i[2] =
   1817       index.i[3] = reg->Indirect.Index;
   1818 
   1819       /* get current value of address register[swizzle] */
   1820       swizzle = reg->Indirect.Swizzle;
   1821 
   1822       /* fetch values from the address/indirection register */
   1823       fetch_src_file_channel(mach,
   1824                              chan_index,
   1825                              reg->Indirect.File,
   1826                              swizzle,
   1827                              &index,
   1828                              &ZeroVec,
   1829                              &indir_index);
   1830 
   1831       /* save indirection offset */
   1832       offset = indir_index.i[0];
   1833    }
   1834 
   1835    /* There is an extra source register that is a second
   1836     * subscript to a register file. Effectively it means that
   1837     * the register file is actually a 2D array of registers.
   1838     *
   1839     *    file[3][1],
   1840     *    where:
   1841     *       [3] = Dimension.Index
   1842     */
   1843    if (reg->Register.Dimension) {
   1844       index2D.i[0] =
   1845       index2D.i[1] =
   1846       index2D.i[2] =
   1847       index2D.i[3] = reg->Dimension.Index;
   1848 
   1849       /* Again, the second subscript index can be addressed indirectly
   1850        * identically to the first one.
   1851        * Nothing stops us from indirectly addressing the indirect register,
   1852        * but there is no need for that, so we won't exercise it.
   1853        *
   1854        *    file[ind[4].y+3][1],
   1855        *    where:
   1856        *       ind = DimIndirect.File
   1857        *       [4] = DimIndirect.Index
   1858        *       .y = DimIndirect.SwizzleX
   1859        */
   1860       if (reg->Dimension.Indirect) {
   1861          union tgsi_exec_channel index2;
   1862          union tgsi_exec_channel indir_index;
   1863          const uint execmask = mach->ExecMask;
   1864          unsigned swizzle;
   1865          uint i;
   1866 
   1867          index2.i[0] =
   1868          index2.i[1] =
   1869          index2.i[2] =
   1870          index2.i[3] = reg->DimIndirect.Index;
   1871 
   1872          swizzle = reg->DimIndirect.Swizzle;
   1873          fetch_src_file_channel(mach,
   1874                                 chan_index,
   1875                                 reg->DimIndirect.File,
   1876                                 swizzle,
   1877                                 &index2,
   1878                                 &ZeroVec,
   1879                                 &indir_index);
   1880 
   1881          index2D.i[0] += indir_index.i[0];
   1882          index2D.i[1] += indir_index.i[1];
   1883          index2D.i[2] += indir_index.i[2];
   1884          index2D.i[3] += indir_index.i[3];
   1885 
   1886          /* for disabled execution channels, zero-out the index to
   1887           * avoid using a potential garbage value.
   1888           */
   1889          for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   1890             if ((execmask & (1 << i)) == 0) {
   1891                index2D.i[i] = 0;
   1892             }
   1893          }
   1894       }
   1895 
   1896       /* If by any chance there was a need for a 3D array of register
   1897        * files, we would have to check whether Dimension is followed
   1898        * by a dimension register and continue the saga.
   1899        */
   1900    } else {
   1901       index2D.i[0] =
   1902       index2D.i[1] =
   1903       index2D.i[2] =
   1904       index2D.i[3] = 0;
   1905    }
   1906 
   1907    switch (reg->Register.File) {
   1908    case TGSI_FILE_NULL:
   1909       dst = &null;
   1910       break;
   1911 
   1912    case TGSI_FILE_OUTPUT:
   1913       index = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
   1914          + reg->Register.Index;
   1915       dst = &mach->Outputs[offset + index].xyzw[chan_index];
   1916 #if 0
   1917       debug_printf("NumOutputs = %d, TEMP_O_C/I = %d, redindex = %d\n",
   1918                    mach->NumOutputs, mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0],
   1919                    reg->Register.Index);
   1920       if (PIPE_SHADER_GEOMETRY == mach->ShaderType) {
   1921          debug_printf("STORING OUT[%d] mask(%d), = (", offset + index, execmask);
   1922          for (i = 0; i < TGSI_QUAD_SIZE; i++)
   1923             if (execmask & (1 << i))
   1924                debug_printf("%f, ", chan->f[i]);
   1925          debug_printf(")\n");
   1926       }
   1927 #endif
   1928       break;
   1929 
   1930    case TGSI_FILE_TEMPORARY:
   1931       index = reg->Register.Index;
   1932       assert( index < TGSI_EXEC_NUM_TEMPS );
   1933       dst = &mach->Temps[offset + index].xyzw[chan_index];
   1934       break;
   1935 
   1936    case TGSI_FILE_ADDRESS:
   1937       index = reg->Register.Index;
   1938       dst = &mach->Addrs[index].xyzw[chan_index];
   1939       break;
   1940 
   1941    case TGSI_FILE_PREDICATE:
   1942       index = reg->Register.Index;
   1943       assert(index < TGSI_EXEC_NUM_PREDS);
   1944       dst = &mach->Predicates[index].xyzw[chan_index];
   1945       break;
   1946 
   1947    default:
   1948       assert( 0 );
   1949       return NULL;
   1950    }
   1951 
   1952    if (inst->Instruction.Predicate) {
   1953       uint swizzle;
   1954       union tgsi_exec_channel *pred;
   1955 
   1956       switch (chan_index) {
   1957       case TGSI_CHAN_X:
   1958          swizzle = inst->Predicate.SwizzleX;
   1959          break;
   1960       case TGSI_CHAN_Y:
   1961          swizzle = inst->Predicate.SwizzleY;
   1962          break;
   1963       case TGSI_CHAN_Z:
   1964          swizzle = inst->Predicate.SwizzleZ;
   1965          break;
   1966       case TGSI_CHAN_W:
   1967          swizzle = inst->Predicate.SwizzleW;
   1968          break;
   1969       default:
   1970          assert(0);
   1971          return NULL;
   1972       }
   1973 
   1974       assert(inst->Predicate.Index == 0);
   1975 
   1976       pred = &mach->Predicates[inst->Predicate.Index].xyzw[swizzle];
   1977 
   1978       if (inst->Predicate.Negate) {
   1979          for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   1980             if (pred->u[i]) {
   1981                execmask &= ~(1 << i);
   1982             }
   1983          }
   1984       } else {
   1985          for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   1986             if (!pred->u[i]) {
   1987                execmask &= ~(1 << i);
   1988             }
   1989          }
   1990       }
   1991    }
   1992 
   1993    return dst;
   1994 }
   1995 
   1996 static void
   1997 store_dest_double(struct tgsi_exec_machine *mach,
   1998                  const union tgsi_exec_channel *chan,
   1999                  const struct tgsi_full_dst_register *reg,
   2000                  const struct tgsi_full_instruction *inst,
   2001                  uint chan_index,
   2002                  enum tgsi_exec_datatype dst_datatype)
   2003 {
   2004    union tgsi_exec_channel *dst;
   2005    const uint execmask = mach->ExecMask;
   2006    int i;
   2007 
   2008    dst = store_dest_dstret(mach, chan, reg, inst, chan_index,
   2009 			   dst_datatype);
   2010    if (!dst)
   2011       return;
   2012 
   2013    /* doubles path */
   2014    for (i = 0; i < TGSI_QUAD_SIZE; i++)
   2015       if (execmask & (1 << i))
   2016          dst->i[i] = chan->i[i];
   2017 }
   2018 
   2019 static void
   2020 store_dest(struct tgsi_exec_machine *mach,
   2021            const union tgsi_exec_channel *chan,
   2022            const struct tgsi_full_dst_register *reg,
   2023            const struct tgsi_full_instruction *inst,
   2024            uint chan_index,
   2025            enum tgsi_exec_datatype dst_datatype)
   2026 {
   2027    union tgsi_exec_channel *dst;
   2028    const uint execmask = mach->ExecMask;
   2029    int i;
   2030 
   2031    dst = store_dest_dstret(mach, chan, reg, inst, chan_index,
   2032                     dst_datatype);
   2033    if (!dst)
   2034       return;
   2035 
   2036    if (!inst->Instruction.Saturate) {
   2037       for (i = 0; i < TGSI_QUAD_SIZE; i++)
   2038          if (execmask & (1 << i))
   2039             dst->i[i] = chan->i[i];
   2040    }
   2041    else {
   2042       for (i = 0; i < TGSI_QUAD_SIZE; i++)
   2043          if (execmask & (1 << i)) {
   2044             if (chan->f[i] < 0.0f)
   2045                dst->f[i] = 0.0f;
   2046             else if (chan->f[i] > 1.0f)
   2047                dst->f[i] = 1.0f;
   2048             else
   2049                dst->i[i] = chan->i[i];
   2050          }
   2051    }
   2052 }
   2053 
   2054 #define FETCH(VAL,INDEX,CHAN)\
   2055     fetch_source(mach, VAL, &inst->Src[INDEX], CHAN, TGSI_EXEC_DATA_FLOAT)
   2056 
   2057 #define IFETCH(VAL,INDEX,CHAN)\
   2058     fetch_source(mach, VAL, &inst->Src[INDEX], CHAN, TGSI_EXEC_DATA_INT)
   2059 
   2060 
   2061 /**
   2062  * Execute ARB-style KIL which is predicated by a src register.
   2063  * Kill fragment if any of the four values is less than zero.
   2064  */
   2065 static void
   2066 exec_kill_if(struct tgsi_exec_machine *mach,
   2067              const struct tgsi_full_instruction *inst)
   2068 {
   2069    uint uniquemask;
   2070    uint chan_index;
   2071    uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
   2072    union tgsi_exec_channel r[1];
   2073 
   2074    /* This mask stores component bits that were already tested. */
   2075    uniquemask = 0;
   2076 
   2077    for (chan_index = 0; chan_index < 4; chan_index++)
   2078    {
   2079       uint swizzle;
   2080       uint i;
   2081 
   2082       /* unswizzle channel */
   2083       swizzle = tgsi_util_get_full_src_register_swizzle (
   2084                         &inst->Src[0],
   2085                         chan_index);
   2086 
   2087       /* check if the component has not been already tested */
   2088       if (uniquemask & (1 << swizzle))
   2089          continue;
   2090       uniquemask |= 1 << swizzle;
   2091 
   2092       FETCH(&r[0], 0, chan_index);
   2093       for (i = 0; i < 4; i++)
   2094          if (r[0].f[i] < 0.0f)
   2095             kilmask |= 1 << i;
   2096    }
   2097 
   2098    /* restrict to fragments currently executing */
   2099    kilmask &= mach->ExecMask;
   2100 
   2101    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
   2102 }
   2103 
   2104 /**
   2105  * Unconditional fragment kill/discard.
   2106  */
   2107 static void
   2108 exec_kill(struct tgsi_exec_machine *mach,
   2109           const struct tgsi_full_instruction *inst)
   2110 {
   2111    uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
   2112 
   2113    /* kill fragment for all fragments currently executing */
   2114    kilmask = mach->ExecMask;
   2115    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
   2116 }
   2117 
   2118 static void
   2119 emit_vertex(struct tgsi_exec_machine *mach)
   2120 {
   2121    /* FIXME: check for exec mask correctly
   2122    unsigned i;
   2123    for (i = 0; i < TGSI_QUAD_SIZE; ++i) {
   2124          if ((mach->ExecMask & (1 << i)))
   2125    */
   2126    if (mach->ExecMask) {
   2127       if (mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]] >= mach->MaxOutputVertices)
   2128          return;
   2129 
   2130       mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += mach->NumOutputs;
   2131       mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
   2132    }
   2133 }
   2134 
   2135 static void
   2136 emit_primitive(struct tgsi_exec_machine *mach)
   2137 {
   2138    unsigned *prim_count = &mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0];
   2139    /* FIXME: check for exec mask correctly
   2140    unsigned i;
   2141    for (i = 0; i < TGSI_QUAD_SIZE; ++i) {
   2142          if ((mach->ExecMask & (1 << i)))
   2143    */
   2144    if (mach->ExecMask) {
   2145       ++(*prim_count);
   2146       debug_assert((*prim_count * mach->NumOutputs) < mach->MaxGeometryShaderOutputs);
   2147       mach->Primitives[*prim_count] = 0;
   2148    }
   2149 }
   2150 
   2151 static void
   2152 conditional_emit_primitive(struct tgsi_exec_machine *mach)
   2153 {
   2154    if (PIPE_SHADER_GEOMETRY == mach->ShaderType) {
   2155       int emitted_verts =
   2156          mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]];
   2157       if (emitted_verts) {
   2158          emit_primitive(mach);
   2159       }
   2160    }
   2161 }
   2162 
   2163 
   2164 /*
   2165  * Fetch four texture samples using STR texture coordinates.
   2166  */
   2167 static void
   2168 fetch_texel( struct tgsi_sampler *sampler,
   2169              const unsigned sview_idx,
   2170              const unsigned sampler_idx,
   2171              const union tgsi_exec_channel *s,
   2172              const union tgsi_exec_channel *t,
   2173              const union tgsi_exec_channel *p,
   2174              const union tgsi_exec_channel *c0,
   2175              const union tgsi_exec_channel *c1,
   2176              float derivs[3][2][TGSI_QUAD_SIZE],
   2177              const int8_t offset[3],
   2178              enum tgsi_sampler_control control,
   2179              union tgsi_exec_channel *r,
   2180              union tgsi_exec_channel *g,
   2181              union tgsi_exec_channel *b,
   2182              union tgsi_exec_channel *a )
   2183 {
   2184    uint j;
   2185    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
   2186 
   2187    /* FIXME: handle explicit derivs, offsets */
   2188    sampler->get_samples(sampler, sview_idx, sampler_idx,
   2189                         s->f, t->f, p->f, c0->f, c1->f, derivs, offset, control, rgba);
   2190 
   2191    for (j = 0; j < 4; j++) {
   2192       r->f[j] = rgba[0][j];
   2193       g->f[j] = rgba[1][j];
   2194       b->f[j] = rgba[2][j];
   2195       a->f[j] = rgba[3][j];
   2196    }
   2197 }
   2198 
   2199 
   2200 #define TEX_MODIFIER_NONE           0
   2201 #define TEX_MODIFIER_PROJECTED      1
   2202 #define TEX_MODIFIER_LOD_BIAS       2
   2203 #define TEX_MODIFIER_EXPLICIT_LOD   3
   2204 #define TEX_MODIFIER_LEVEL_ZERO     4
   2205 #define TEX_MODIFIER_GATHER         5
   2206 
   2207 /*
   2208  * Fetch all 3 (for s,t,r coords) texel offsets, put them into int array.
   2209  */
   2210 static void
   2211 fetch_texel_offsets(struct tgsi_exec_machine *mach,
   2212                     const struct tgsi_full_instruction *inst,
   2213                     int8_t offsets[3])
   2214 {
   2215    if (inst->Texture.NumOffsets == 1) {
   2216       union tgsi_exec_channel index;
   2217       union tgsi_exec_channel offset[3];
   2218       index.i[0] = index.i[1] = index.i[2] = index.i[3] = inst->TexOffsets[0].Index;
   2219       fetch_src_file_channel(mach, 0, inst->TexOffsets[0].File,
   2220                              inst->TexOffsets[0].SwizzleX, &index, &ZeroVec, &offset[0]);
   2221       fetch_src_file_channel(mach, 0, inst->TexOffsets[0].File,
   2222                              inst->TexOffsets[0].SwizzleY, &index, &ZeroVec, &offset[1]);
   2223       fetch_src_file_channel(mach, 0, inst->TexOffsets[0].File,
   2224                              inst->TexOffsets[0].SwizzleZ, &index, &ZeroVec, &offset[2]);
   2225      offsets[0] = offset[0].i[0];
   2226      offsets[1] = offset[1].i[0];
   2227      offsets[2] = offset[2].i[0];
   2228    } else {
   2229      assert(inst->Texture.NumOffsets == 0);
   2230      offsets[0] = offsets[1] = offsets[2] = 0;
   2231    }
   2232 }
   2233 
   2234 
   2235 /*
   2236  * Fetch dx and dy values for one channel (s, t or r).
   2237  * Put dx values into one float array, dy values into another.
   2238  */
   2239 static void
   2240 fetch_assign_deriv_channel(struct tgsi_exec_machine *mach,
   2241                            const struct tgsi_full_instruction *inst,
   2242                            unsigned regdsrcx,
   2243                            unsigned chan,
   2244                            float derivs[2][TGSI_QUAD_SIZE])
   2245 {
   2246    union tgsi_exec_channel d;
   2247    FETCH(&d, regdsrcx, chan);
   2248    derivs[0][0] = d.f[0];
   2249    derivs[0][1] = d.f[1];
   2250    derivs[0][2] = d.f[2];
   2251    derivs[0][3] = d.f[3];
   2252    FETCH(&d, regdsrcx + 1, chan);
   2253    derivs[1][0] = d.f[0];
   2254    derivs[1][1] = d.f[1];
   2255    derivs[1][2] = d.f[2];
   2256    derivs[1][3] = d.f[3];
   2257 }
   2258 
   2259 static uint
   2260 fetch_sampler_unit(struct tgsi_exec_machine *mach,
   2261                    const struct tgsi_full_instruction *inst,
   2262                    uint sampler)
   2263 {
   2264    uint unit = 0;
   2265    int i;
   2266    if (inst->Src[sampler].Register.Indirect) {
   2267       const struct tgsi_full_src_register *reg = &inst->Src[sampler];
   2268       union tgsi_exec_channel indir_index, index2;
   2269       const uint execmask = mach->ExecMask;
   2270       index2.i[0] =
   2271       index2.i[1] =
   2272       index2.i[2] =
   2273       index2.i[3] = reg->Indirect.Index;
   2274 
   2275       fetch_src_file_channel(mach,
   2276                              0,
   2277                              reg->Indirect.File,
   2278                              reg->Indirect.Swizzle,
   2279                              &index2,
   2280                              &ZeroVec,
   2281                              &indir_index);
   2282       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   2283          if (execmask & (1 << i)) {
   2284             unit = inst->Src[sampler].Register.Index + indir_index.i[i];
   2285             break;
   2286          }
   2287       }
   2288 
   2289    } else {
   2290       unit = inst->Src[sampler].Register.Index;
   2291    }
   2292    return unit;
   2293 }
   2294 
   2295 /*
   2296  * execute a texture instruction.
   2297  *
   2298  * modifier is used to control the channel routing for the
   2299  * instruction variants like proj, lod, and texture with lod bias.
   2300  * sampler indicates which src register the sampler is contained in.
   2301  */
   2302 static void
   2303 exec_tex(struct tgsi_exec_machine *mach,
   2304          const struct tgsi_full_instruction *inst,
   2305          uint modifier, uint sampler)
   2306 {
   2307    const union tgsi_exec_channel *args[5], *proj = NULL;
   2308    union tgsi_exec_channel r[5];
   2309    enum tgsi_sampler_control control = TGSI_SAMPLER_LOD_NONE;
   2310    uint chan;
   2311    uint unit;
   2312    int8_t offsets[3];
   2313    int dim, shadow_ref, i;
   2314 
   2315    unit = fetch_sampler_unit(mach, inst, sampler);
   2316    /* always fetch all 3 offsets, overkill but keeps code simple */
   2317    fetch_texel_offsets(mach, inst, offsets);
   2318 
   2319    assert(modifier != TEX_MODIFIER_LEVEL_ZERO);
   2320    assert(inst->Texture.Texture != TGSI_TEXTURE_BUFFER);
   2321 
   2322    dim = tgsi_util_get_texture_coord_dim(inst->Texture.Texture);
   2323    shadow_ref = tgsi_util_get_shadow_ref_src_index(inst->Texture.Texture);
   2324 
   2325    assert(dim <= 4);
   2326    if (shadow_ref >= 0)
   2327       assert(shadow_ref >= dim && shadow_ref < ARRAY_SIZE(args));
   2328 
   2329    /* fetch modifier to the last argument */
   2330    if (modifier != TEX_MODIFIER_NONE) {
   2331       const int last = ARRAY_SIZE(args) - 1;
   2332 
   2333       /* fetch modifier from src0.w or src1.x */
   2334       if (sampler == 1) {
   2335          assert(dim <= TGSI_CHAN_W && shadow_ref != TGSI_CHAN_W);
   2336          FETCH(&r[last], 0, TGSI_CHAN_W);
   2337       }
   2338       else {
   2339          assert(shadow_ref != 4);
   2340          FETCH(&r[last], 1, TGSI_CHAN_X);
   2341       }
   2342 
   2343       if (modifier != TEX_MODIFIER_PROJECTED) {
   2344          args[last] = &r[last];
   2345       }
   2346       else {
   2347          proj = &r[last];
   2348          args[last] = &ZeroVec;
   2349       }
   2350 
   2351       /* point unused arguments to zero vector */
   2352       for (i = dim; i < last; i++)
   2353          args[i] = &ZeroVec;
   2354 
   2355       if (modifier == TEX_MODIFIER_EXPLICIT_LOD)
   2356          control = TGSI_SAMPLER_LOD_EXPLICIT;
   2357       else if (modifier == TEX_MODIFIER_LOD_BIAS)
   2358          control = TGSI_SAMPLER_LOD_BIAS;
   2359       else if (modifier == TEX_MODIFIER_GATHER)
   2360          control = TGSI_SAMPLER_GATHER;
   2361    }
   2362    else {
   2363       for (i = dim; i < ARRAY_SIZE(args); i++)
   2364          args[i] = &ZeroVec;
   2365    }
   2366 
   2367    /* fetch coordinates */
   2368    for (i = 0; i < dim; i++) {
   2369       FETCH(&r[i], 0, TGSI_CHAN_X + i);
   2370 
   2371       if (proj)
   2372          micro_div(&r[i], &r[i], proj);
   2373 
   2374       args[i] = &r[i];
   2375    }
   2376 
   2377    /* fetch reference value */
   2378    if (shadow_ref >= 0) {
   2379       FETCH(&r[shadow_ref], shadow_ref / 4, TGSI_CHAN_X + (shadow_ref % 4));
   2380 
   2381       if (proj)
   2382          micro_div(&r[shadow_ref], &r[shadow_ref], proj);
   2383 
   2384       args[shadow_ref] = &r[shadow_ref];
   2385    }
   2386 
   2387    fetch_texel(mach->Sampler, unit, unit,
   2388          args[0], args[1], args[2], args[3], args[4],
   2389          NULL, offsets, control,
   2390          &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
   2391 
   2392 #if 0
   2393    debug_printf("fetch r: %g %g %g %g\n",
   2394          r[0].f[0], r[0].f[1], r[0].f[2], r[0].f[3]);
   2395    debug_printf("fetch g: %g %g %g %g\n",
   2396          r[1].f[0], r[1].f[1], r[1].f[2], r[1].f[3]);
   2397    debug_printf("fetch b: %g %g %g %g\n",
   2398          r[2].f[0], r[2].f[1], r[2].f[2], r[2].f[3]);
   2399    debug_printf("fetch a: %g %g %g %g\n",
   2400          r[3].f[0], r[3].f[1], r[3].f[2], r[3].f[3]);
   2401 #endif
   2402 
   2403    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   2404       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   2405          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
   2406       }
   2407    }
   2408 }
   2409 
   2410 static void
   2411 exec_lodq(struct tgsi_exec_machine *mach,
   2412           const struct tgsi_full_instruction *inst)
   2413 {
   2414    uint unit;
   2415    int dim;
   2416    int i;
   2417    union tgsi_exec_channel coords[4];
   2418    const union tgsi_exec_channel *args[ARRAY_SIZE(coords)];
   2419    union tgsi_exec_channel r[2];
   2420 
   2421    unit = fetch_sampler_unit(mach, inst, 1);
   2422    dim = tgsi_util_get_texture_coord_dim(inst->Texture.Texture);
   2423    assert(dim <= ARRAY_SIZE(coords));
   2424    /* fetch coordinates */
   2425    for (i = 0; i < dim; i++) {
   2426       FETCH(&coords[i], 0, TGSI_CHAN_X + i);
   2427       args[i] = &coords[i];
   2428    }
   2429    for (i = dim; i < ARRAY_SIZE(coords); i++) {
   2430       args[i] = &ZeroVec;
   2431    }
   2432    mach->Sampler->query_lod(mach->Sampler, unit, unit,
   2433                             args[0]->f,
   2434                             args[1]->f,
   2435                             args[2]->f,
   2436                             args[3]->f,
   2437                             TGSI_SAMPLER_LOD_NONE,
   2438                             r[0].f,
   2439                             r[1].f);
   2440 
   2441    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
   2442       store_dest(mach, &r[0], &inst->Dst[0], inst, TGSI_CHAN_X,
   2443                  TGSI_EXEC_DATA_FLOAT);
   2444    }
   2445    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
   2446       store_dest(mach, &r[1], &inst->Dst[0], inst, TGSI_CHAN_Y,
   2447                  TGSI_EXEC_DATA_FLOAT);
   2448    }
   2449 }
   2450 
   2451 static void
   2452 exec_txd(struct tgsi_exec_machine *mach,
   2453          const struct tgsi_full_instruction *inst)
   2454 {
   2455    union tgsi_exec_channel r[4];
   2456    float derivs[3][2][TGSI_QUAD_SIZE];
   2457    uint chan;
   2458    uint unit;
   2459    int8_t offsets[3];
   2460 
   2461    unit = fetch_sampler_unit(mach, inst, 3);
   2462    /* always fetch all 3 offsets, overkill but keeps code simple */
   2463    fetch_texel_offsets(mach, inst, offsets);
   2464 
   2465    switch (inst->Texture.Texture) {
   2466    case TGSI_TEXTURE_1D:
   2467       FETCH(&r[0], 0, TGSI_CHAN_X);
   2468 
   2469       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
   2470 
   2471       fetch_texel(mach->Sampler, unit, unit,
   2472                   &r[0], &ZeroVec, &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, C, LOD */
   2473                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
   2474                   &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
   2475       break;
   2476 
   2477    case TGSI_TEXTURE_SHADOW1D:
   2478    case TGSI_TEXTURE_1D_ARRAY:
   2479    case TGSI_TEXTURE_SHADOW1D_ARRAY:
   2480       /* SHADOW1D/1D_ARRAY would not need Y/Z respectively, but don't bother */
   2481       FETCH(&r[0], 0, TGSI_CHAN_X);
   2482       FETCH(&r[1], 0, TGSI_CHAN_Y);
   2483       FETCH(&r[2], 0, TGSI_CHAN_Z);
   2484 
   2485       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
   2486 
   2487       fetch_texel(mach->Sampler, unit, unit,
   2488                   &r[0], &r[1], &r[2], &ZeroVec, &ZeroVec,   /* S, T, P, C, LOD */
   2489                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
   2490                   &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
   2491       break;
   2492 
   2493    case TGSI_TEXTURE_2D:
   2494    case TGSI_TEXTURE_RECT:
   2495       FETCH(&r[0], 0, TGSI_CHAN_X);
   2496       FETCH(&r[1], 0, TGSI_CHAN_Y);
   2497 
   2498       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
   2499       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Y, derivs[1]);
   2500 
   2501       fetch_texel(mach->Sampler, unit, unit,
   2502                   &r[0], &r[1], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, C, LOD */
   2503                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
   2504                   &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
   2505       break;
   2506 
   2507 
   2508    case TGSI_TEXTURE_SHADOW2D:
   2509    case TGSI_TEXTURE_SHADOWRECT:
   2510    case TGSI_TEXTURE_2D_ARRAY:
   2511    case TGSI_TEXTURE_SHADOW2D_ARRAY:
   2512       /* only SHADOW2D_ARRAY actually needs W */
   2513       FETCH(&r[0], 0, TGSI_CHAN_X);
   2514       FETCH(&r[1], 0, TGSI_CHAN_Y);
   2515       FETCH(&r[2], 0, TGSI_CHAN_Z);
   2516       FETCH(&r[3], 0, TGSI_CHAN_W);
   2517 
   2518       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
   2519       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Y, derivs[1]);
   2520 
   2521       fetch_texel(mach->Sampler, unit, unit,
   2522                   &r[0], &r[1], &r[2], &r[3], &ZeroVec,   /* inputs */
   2523                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
   2524                   &r[0], &r[1], &r[2], &r[3]);     /* outputs */
   2525       break;
   2526 
   2527    case TGSI_TEXTURE_3D:
   2528    case TGSI_TEXTURE_CUBE:
   2529    case TGSI_TEXTURE_CUBE_ARRAY:
   2530    case TGSI_TEXTURE_SHADOWCUBE:
   2531       /* only TEXTURE_CUBE_ARRAY and TEXTURE_SHADOWCUBE actually need W */
   2532       FETCH(&r[0], 0, TGSI_CHAN_X);
   2533       FETCH(&r[1], 0, TGSI_CHAN_Y);
   2534       FETCH(&r[2], 0, TGSI_CHAN_Z);
   2535       FETCH(&r[3], 0, TGSI_CHAN_W);
   2536 
   2537       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
   2538       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Y, derivs[1]);
   2539       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Z, derivs[2]);
   2540 
   2541       fetch_texel(mach->Sampler, unit, unit,
   2542                   &r[0], &r[1], &r[2], &r[3], &ZeroVec,   /* inputs */
   2543                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
   2544                   &r[0], &r[1], &r[2], &r[3]);     /* outputs */
   2545       break;
   2546 
   2547    default:
   2548       assert(0);
   2549    }
   2550 
   2551    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   2552       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   2553          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
   2554       }
   2555    }
   2556 }
   2557 
   2558 
   2559 static void
   2560 exec_txf(struct tgsi_exec_machine *mach,
   2561          const struct tgsi_full_instruction *inst)
   2562 {
   2563    union tgsi_exec_channel r[4];
   2564    uint chan;
   2565    uint unit;
   2566    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
   2567    int j;
   2568    int8_t offsets[3];
   2569    unsigned target;
   2570 
   2571    unit = fetch_sampler_unit(mach, inst, 1);
   2572    /* always fetch all 3 offsets, overkill but keeps code simple */
   2573    fetch_texel_offsets(mach, inst, offsets);
   2574 
   2575    IFETCH(&r[3], 0, TGSI_CHAN_W);
   2576 
   2577    if (inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I ||
   2578        inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I_MS) {
   2579       target = mach->SamplerViews[unit].Resource;
   2580    }
   2581    else {
   2582       target = inst->Texture.Texture;
   2583    }
   2584    switch(target) {
   2585    case TGSI_TEXTURE_3D:
   2586    case TGSI_TEXTURE_2D_ARRAY:
   2587    case TGSI_TEXTURE_SHADOW2D_ARRAY:
   2588    case TGSI_TEXTURE_2D_ARRAY_MSAA:
   2589       IFETCH(&r[2], 0, TGSI_CHAN_Z);
   2590       /* fallthrough */
   2591    case TGSI_TEXTURE_2D:
   2592    case TGSI_TEXTURE_RECT:
   2593    case TGSI_TEXTURE_SHADOW1D_ARRAY:
   2594    case TGSI_TEXTURE_SHADOW2D:
   2595    case TGSI_TEXTURE_SHADOWRECT:
   2596    case TGSI_TEXTURE_1D_ARRAY:
   2597    case TGSI_TEXTURE_2D_MSAA:
   2598       IFETCH(&r[1], 0, TGSI_CHAN_Y);
   2599       /* fallthrough */
   2600    case TGSI_TEXTURE_BUFFER:
   2601    case TGSI_TEXTURE_1D:
   2602    case TGSI_TEXTURE_SHADOW1D:
   2603       IFETCH(&r[0], 0, TGSI_CHAN_X);
   2604       break;
   2605    default:
   2606       assert(0);
   2607       break;
   2608    }
   2609 
   2610    mach->Sampler->get_texel(mach->Sampler, unit, r[0].i, r[1].i, r[2].i, r[3].i,
   2611                             offsets, rgba);
   2612 
   2613    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
   2614       r[0].f[j] = rgba[0][j];
   2615       r[1].f[j] = rgba[1][j];
   2616       r[2].f[j] = rgba[2][j];
   2617       r[3].f[j] = rgba[3][j];
   2618    }
   2619 
   2620    if (inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I ||
   2621        inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I_MS) {
   2622       unsigned char swizzles[4];
   2623       swizzles[0] = inst->Src[1].Register.SwizzleX;
   2624       swizzles[1] = inst->Src[1].Register.SwizzleY;
   2625       swizzles[2] = inst->Src[1].Register.SwizzleZ;
   2626       swizzles[3] = inst->Src[1].Register.SwizzleW;
   2627 
   2628       for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   2629          if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   2630             store_dest(mach, &r[swizzles[chan]],
   2631                        &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
   2632          }
   2633       }
   2634    }
   2635    else {
   2636       for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   2637          if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   2638             store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
   2639          }
   2640       }
   2641    }
   2642 }
   2643 
   2644 static void
   2645 exec_txq(struct tgsi_exec_machine *mach,
   2646          const struct tgsi_full_instruction *inst)
   2647 {
   2648    int result[4];
   2649    union tgsi_exec_channel r[4], src;
   2650    uint chan;
   2651    uint unit;
   2652    int i,j;
   2653 
   2654    unit = fetch_sampler_unit(mach, inst, 1);
   2655 
   2656    fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_INT);
   2657 
   2658    /* XXX: This interface can't return per-pixel values */
   2659    mach->Sampler->get_dims(mach->Sampler, unit, src.i[0], result);
   2660 
   2661    for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   2662       for (j = 0; j < 4; j++) {
   2663          r[j].i[i] = result[j];
   2664       }
   2665    }
   2666 
   2667    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   2668       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   2669          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan,
   2670                     TGSI_EXEC_DATA_INT);
   2671       }
   2672    }
   2673 }
   2674 
   2675 static void
   2676 exec_sample(struct tgsi_exec_machine *mach,
   2677             const struct tgsi_full_instruction *inst,
   2678             uint modifier, boolean compare)
   2679 {
   2680    const uint resource_unit = inst->Src[1].Register.Index;
   2681    const uint sampler_unit = inst->Src[2].Register.Index;
   2682    union tgsi_exec_channel r[5], c1;
   2683    const union tgsi_exec_channel *lod = &ZeroVec;
   2684    enum tgsi_sampler_control control = TGSI_SAMPLER_LOD_NONE;
   2685    uint chan;
   2686    unsigned char swizzles[4];
   2687    int8_t offsets[3];
   2688 
   2689    /* always fetch all 3 offsets, overkill but keeps code simple */
   2690    fetch_texel_offsets(mach, inst, offsets);
   2691 
   2692    assert(modifier != TEX_MODIFIER_PROJECTED);
   2693 
   2694    if (modifier != TEX_MODIFIER_NONE) {
   2695       if (modifier == TEX_MODIFIER_LOD_BIAS) {
   2696          FETCH(&c1, 3, TGSI_CHAN_X);
   2697          lod = &c1;
   2698          control = TGSI_SAMPLER_LOD_BIAS;
   2699       }
   2700       else if (modifier == TEX_MODIFIER_EXPLICIT_LOD) {
   2701          FETCH(&c1, 3, TGSI_CHAN_X);
   2702          lod = &c1;
   2703          control = TGSI_SAMPLER_LOD_EXPLICIT;
   2704       }
   2705       else {
   2706          assert(modifier == TEX_MODIFIER_LEVEL_ZERO);
   2707          control = TGSI_SAMPLER_LOD_ZERO;
   2708       }
   2709    }
   2710 
   2711    FETCH(&r[0], 0, TGSI_CHAN_X);
   2712 
   2713    switch (mach->SamplerViews[resource_unit].Resource) {
   2714    case TGSI_TEXTURE_1D:
   2715       if (compare) {
   2716          FETCH(&r[2], 3, TGSI_CHAN_X);
   2717          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
   2718                      &r[0], &ZeroVec, &r[2], &ZeroVec, lod, /* S, T, P, C, LOD */
   2719                      NULL, offsets, control,
   2720                      &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
   2721       }
   2722       else {
   2723          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
   2724                      &r[0], &ZeroVec, &ZeroVec, &ZeroVec, lod, /* S, T, P, C, LOD */
   2725                      NULL, offsets, control,
   2726                      &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
   2727       }
   2728       break;
   2729 
   2730    case TGSI_TEXTURE_1D_ARRAY:
   2731    case TGSI_TEXTURE_2D:
   2732    case TGSI_TEXTURE_RECT:
   2733       FETCH(&r[1], 0, TGSI_CHAN_Y);
   2734       if (compare) {
   2735          FETCH(&r[2], 3, TGSI_CHAN_X);
   2736          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
   2737                      &r[0], &r[1], &r[2], &ZeroVec, lod,    /* S, T, P, C, LOD */
   2738                      NULL, offsets, control,
   2739                      &r[0], &r[1], &r[2], &r[3]);  /* outputs */
   2740       }
   2741       else {
   2742          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
   2743                      &r[0], &r[1], &ZeroVec, &ZeroVec, lod,    /* S, T, P, C, LOD */
   2744                      NULL, offsets, control,
   2745                      &r[0], &r[1], &r[2], &r[3]);  /* outputs */
   2746       }
   2747       break;
   2748 
   2749    case TGSI_TEXTURE_2D_ARRAY:
   2750    case TGSI_TEXTURE_3D:
   2751    case TGSI_TEXTURE_CUBE:
   2752       FETCH(&r[1], 0, TGSI_CHAN_Y);
   2753       FETCH(&r[2], 0, TGSI_CHAN_Z);
   2754       if(compare) {
   2755          FETCH(&r[3], 3, TGSI_CHAN_X);
   2756          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
   2757                      &r[0], &r[1], &r[2], &r[3], lod,
   2758                      NULL, offsets, control,
   2759                      &r[0], &r[1], &r[2], &r[3]);
   2760       }
   2761       else {
   2762          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
   2763                      &r[0], &r[1], &r[2], &ZeroVec, lod,
   2764                      NULL, offsets, control,
   2765                      &r[0], &r[1], &r[2], &r[3]);
   2766       }
   2767       break;
   2768 
   2769    case TGSI_TEXTURE_CUBE_ARRAY:
   2770       FETCH(&r[1], 0, TGSI_CHAN_Y);
   2771       FETCH(&r[2], 0, TGSI_CHAN_Z);
   2772       FETCH(&r[3], 0, TGSI_CHAN_W);
   2773       if(compare) {
   2774          FETCH(&r[4], 3, TGSI_CHAN_X);
   2775          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
   2776                      &r[0], &r[1], &r[2], &r[3], &r[4],
   2777                      NULL, offsets, control,
   2778                      &r[0], &r[1], &r[2], &r[3]);
   2779       }
   2780       else {
   2781          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
   2782                      &r[0], &r[1], &r[2], &r[3], lod,
   2783                      NULL, offsets, control,
   2784                      &r[0], &r[1], &r[2], &r[3]);
   2785       }
   2786       break;
   2787 
   2788 
   2789    default:
   2790       assert(0);
   2791    }
   2792 
   2793    swizzles[0] = inst->Src[1].Register.SwizzleX;
   2794    swizzles[1] = inst->Src[1].Register.SwizzleY;
   2795    swizzles[2] = inst->Src[1].Register.SwizzleZ;
   2796    swizzles[3] = inst->Src[1].Register.SwizzleW;
   2797 
   2798    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   2799       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   2800          store_dest(mach, &r[swizzles[chan]],
   2801                     &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
   2802       }
   2803    }
   2804 }
   2805 
   2806 static void
   2807 exec_sample_d(struct tgsi_exec_machine *mach,
   2808               const struct tgsi_full_instruction *inst)
   2809 {
   2810    const uint resource_unit = inst->Src[1].Register.Index;
   2811    const uint sampler_unit = inst->Src[2].Register.Index;
   2812    union tgsi_exec_channel r[4];
   2813    float derivs[3][2][TGSI_QUAD_SIZE];
   2814    uint chan;
   2815    unsigned char swizzles[4];
   2816    int8_t offsets[3];
   2817 
   2818    /* always fetch all 3 offsets, overkill but keeps code simple */
   2819    fetch_texel_offsets(mach, inst, offsets);
   2820 
   2821    FETCH(&r[0], 0, TGSI_CHAN_X);
   2822 
   2823    switch (mach->SamplerViews[resource_unit].Resource) {
   2824    case TGSI_TEXTURE_1D:
   2825    case TGSI_TEXTURE_1D_ARRAY:
   2826       /* only 1D array actually needs Y */
   2827       FETCH(&r[1], 0, TGSI_CHAN_Y);
   2828 
   2829       fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_X, derivs[0]);
   2830 
   2831       fetch_texel(mach->Sampler, resource_unit, sampler_unit,
   2832                   &r[0], &r[1], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, C, LOD */
   2833                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
   2834                   &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
   2835       break;
   2836 
   2837    case TGSI_TEXTURE_2D:
   2838    case TGSI_TEXTURE_RECT:
   2839    case TGSI_TEXTURE_2D_ARRAY:
   2840       /* only 2D array actually needs Z */
   2841       FETCH(&r[1], 0, TGSI_CHAN_Y);
   2842       FETCH(&r[2], 0, TGSI_CHAN_Z);
   2843 
   2844       fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_X, derivs[0]);
   2845       fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_Y, derivs[1]);
   2846 
   2847       fetch_texel(mach->Sampler, resource_unit, sampler_unit,
   2848                   &r[0], &r[1], &r[2], &ZeroVec, &ZeroVec,   /* inputs */
   2849                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
   2850                   &r[0], &r[1], &r[2], &r[3]);     /* outputs */
   2851       break;
   2852 
   2853    case TGSI_TEXTURE_3D:
   2854    case TGSI_TEXTURE_CUBE:
   2855    case TGSI_TEXTURE_CUBE_ARRAY:
   2856       /* only cube array actually needs W */
   2857       FETCH(&r[1], 0, TGSI_CHAN_Y);
   2858       FETCH(&r[2], 0, TGSI_CHAN_Z);
   2859       FETCH(&r[3], 0, TGSI_CHAN_W);
   2860 
   2861       fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_X, derivs[0]);
   2862       fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_Y, derivs[1]);
   2863       fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_Z, derivs[2]);
   2864 
   2865       fetch_texel(mach->Sampler, resource_unit, sampler_unit,
   2866                   &r[0], &r[1], &r[2], &r[3], &ZeroVec,
   2867                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
   2868                   &r[0], &r[1], &r[2], &r[3]);
   2869       break;
   2870 
   2871    default:
   2872       assert(0);
   2873    }
   2874 
   2875    swizzles[0] = inst->Src[1].Register.SwizzleX;
   2876    swizzles[1] = inst->Src[1].Register.SwizzleY;
   2877    swizzles[2] = inst->Src[1].Register.SwizzleZ;
   2878    swizzles[3] = inst->Src[1].Register.SwizzleW;
   2879 
   2880    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   2881       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   2882          store_dest(mach, &r[swizzles[chan]],
   2883                     &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
   2884       }
   2885    }
   2886 }
   2887 
   2888 
   2889 /**
   2890  * Evaluate a constant-valued coefficient at the position of the
   2891  * current quad.
   2892  */
   2893 static void
   2894 eval_constant_coef(
   2895    struct tgsi_exec_machine *mach,
   2896    unsigned attrib,
   2897    unsigned chan )
   2898 {
   2899    unsigned i;
   2900 
   2901    for( i = 0; i < TGSI_QUAD_SIZE; i++ ) {
   2902       mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
   2903    }
   2904 }
   2905 
   2906 /**
   2907  * Evaluate a linear-valued coefficient at the position of the
   2908  * current quad.
   2909  */
   2910 static void
   2911 eval_linear_coef(
   2912    struct tgsi_exec_machine *mach,
   2913    unsigned attrib,
   2914    unsigned chan )
   2915 {
   2916    const float x = mach->QuadPos.xyzw[0].f[0];
   2917    const float y = mach->QuadPos.xyzw[1].f[0];
   2918    const float dadx = mach->InterpCoefs[attrib].dadx[chan];
   2919    const float dady = mach->InterpCoefs[attrib].dady[chan];
   2920    const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
   2921    mach->Inputs[attrib].xyzw[chan].f[0] = a0;
   2922    mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
   2923    mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
   2924    mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
   2925 }
   2926 
   2927 /**
   2928  * Evaluate a perspective-valued coefficient at the position of the
   2929  * current quad.
   2930  */
   2931 static void
   2932 eval_perspective_coef(
   2933    struct tgsi_exec_machine *mach,
   2934    unsigned attrib,
   2935    unsigned chan )
   2936 {
   2937    const float x = mach->QuadPos.xyzw[0].f[0];
   2938    const float y = mach->QuadPos.xyzw[1].f[0];
   2939    const float dadx = mach->InterpCoefs[attrib].dadx[chan];
   2940    const float dady = mach->InterpCoefs[attrib].dady[chan];
   2941    const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
   2942    const float *w = mach->QuadPos.xyzw[3].f;
   2943    /* divide by W here */
   2944    mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
   2945    mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
   2946    mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
   2947    mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
   2948 }
   2949 
   2950 
   2951 typedef void (* eval_coef_func)(
   2952    struct tgsi_exec_machine *mach,
   2953    unsigned attrib,
   2954    unsigned chan );
   2955 
   2956 static void
   2957 exec_declaration(struct tgsi_exec_machine *mach,
   2958                  const struct tgsi_full_declaration *decl)
   2959 {
   2960    if (decl->Declaration.File == TGSI_FILE_SAMPLER_VIEW) {
   2961       mach->SamplerViews[decl->Range.First] = decl->SamplerView;
   2962       return;
   2963    }
   2964 
   2965    if (mach->ShaderType == PIPE_SHADER_FRAGMENT) {
   2966       if (decl->Declaration.File == TGSI_FILE_INPUT) {
   2967          uint first, last, mask;
   2968 
   2969          first = decl->Range.First;
   2970          last = decl->Range.Last;
   2971          mask = decl->Declaration.UsageMask;
   2972 
   2973          /* XXX we could remove this special-case code since
   2974           * mach->InterpCoefs[first].a0 should already have the
   2975           * front/back-face value.  But we should first update the
   2976           * ureg code to emit the right UsageMask value (WRITEMASK_X).
   2977           * Then, we could remove the tgsi_exec_machine::Face field.
   2978           */
   2979          /* XXX make FACE a system value */
   2980          if (decl->Semantic.Name == TGSI_SEMANTIC_FACE) {
   2981             uint i;
   2982 
   2983             assert(decl->Semantic.Index == 0);
   2984             assert(first == last);
   2985 
   2986             for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   2987                mach->Inputs[first].xyzw[0].f[i] = mach->Face;
   2988             }
   2989          } else {
   2990             eval_coef_func eval;
   2991             uint i, j;
   2992 
   2993             switch (decl->Interp.Interpolate) {
   2994             case TGSI_INTERPOLATE_CONSTANT:
   2995                eval = eval_constant_coef;
   2996                break;
   2997 
   2998             case TGSI_INTERPOLATE_LINEAR:
   2999                eval = eval_linear_coef;
   3000                break;
   3001 
   3002             case TGSI_INTERPOLATE_PERSPECTIVE:
   3003                eval = eval_perspective_coef;
   3004                break;
   3005 
   3006             case TGSI_INTERPOLATE_COLOR:
   3007                eval = mach->flatshade_color ? eval_constant_coef : eval_perspective_coef;
   3008                break;
   3009 
   3010             default:
   3011                assert(0);
   3012                return;
   3013             }
   3014 
   3015             for (j = 0; j < TGSI_NUM_CHANNELS; j++) {
   3016                if (mask & (1 << j)) {
   3017                   for (i = first; i <= last; i++) {
   3018                      eval(mach, i, j);
   3019                   }
   3020                }
   3021             }
   3022          }
   3023 
   3024          if (DEBUG_EXECUTION) {
   3025             uint i, j;
   3026             for (i = first; i <= last; ++i) {
   3027                debug_printf("IN[%2u] = ", i);
   3028                for (j = 0; j < TGSI_NUM_CHANNELS; j++) {
   3029                   if (j > 0) {
   3030                      debug_printf("         ");
   3031                   }
   3032                   debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
   3033                                mach->Inputs[i].xyzw[0].f[j], mach->Inputs[i].xyzw[0].u[j],
   3034                                mach->Inputs[i].xyzw[1].f[j], mach->Inputs[i].xyzw[1].u[j],
   3035                                mach->Inputs[i].xyzw[2].f[j], mach->Inputs[i].xyzw[2].u[j],
   3036                                mach->Inputs[i].xyzw[3].f[j], mach->Inputs[i].xyzw[3].u[j]);
   3037                }
   3038             }
   3039          }
   3040       }
   3041    }
   3042 
   3043 }
   3044 
   3045 typedef void (* micro_unary_op)(union tgsi_exec_channel *dst,
   3046                                 const union tgsi_exec_channel *src);
   3047 
   3048 static void
   3049 exec_scalar_unary(struct tgsi_exec_machine *mach,
   3050                   const struct tgsi_full_instruction *inst,
   3051                   micro_unary_op op,
   3052                   enum tgsi_exec_datatype dst_datatype,
   3053                   enum tgsi_exec_datatype src_datatype)
   3054 {
   3055    unsigned int chan;
   3056    union tgsi_exec_channel src;
   3057    union tgsi_exec_channel dst;
   3058 
   3059    fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, src_datatype);
   3060    op(&dst, &src);
   3061    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   3062       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   3063          store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
   3064       }
   3065    }
   3066 }
   3067 
   3068 static void
   3069 exec_vector_unary(struct tgsi_exec_machine *mach,
   3070                   const struct tgsi_full_instruction *inst,
   3071                   micro_unary_op op,
   3072                   enum tgsi_exec_datatype dst_datatype,
   3073                   enum tgsi_exec_datatype src_datatype)
   3074 {
   3075    unsigned int chan;
   3076    struct tgsi_exec_vector dst;
   3077 
   3078    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   3079       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   3080          union tgsi_exec_channel src;
   3081 
   3082          fetch_source(mach, &src, &inst->Src[0], chan, src_datatype);
   3083          op(&dst.xyzw[chan], &src);
   3084       }
   3085    }
   3086    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   3087       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   3088          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
   3089       }
   3090    }
   3091 }
   3092 
   3093 typedef void (* micro_binary_op)(union tgsi_exec_channel *dst,
   3094                                  const union tgsi_exec_channel *src0,
   3095                                  const union tgsi_exec_channel *src1);
   3096 
   3097 static void
   3098 exec_scalar_binary(struct tgsi_exec_machine *mach,
   3099                    const struct tgsi_full_instruction *inst,
   3100                    micro_binary_op op,
   3101                    enum tgsi_exec_datatype dst_datatype,
   3102                    enum tgsi_exec_datatype src_datatype)
   3103 {
   3104    unsigned int chan;
   3105    union tgsi_exec_channel src[2];
   3106    union tgsi_exec_channel dst;
   3107 
   3108    fetch_source(mach, &src[0], &inst->Src[0], TGSI_CHAN_X, src_datatype);
   3109    fetch_source(mach, &src[1], &inst->Src[1], TGSI_CHAN_X, src_datatype);
   3110    op(&dst, &src[0], &src[1]);
   3111    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   3112       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   3113          store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
   3114       }
   3115    }
   3116 }
   3117 
   3118 static void
   3119 exec_vector_binary(struct tgsi_exec_machine *mach,
   3120                    const struct tgsi_full_instruction *inst,
   3121                    micro_binary_op op,
   3122                    enum tgsi_exec_datatype dst_datatype,
   3123                    enum tgsi_exec_datatype src_datatype)
   3124 {
   3125    unsigned int chan;
   3126    struct tgsi_exec_vector dst;
   3127 
   3128    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   3129       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   3130          union tgsi_exec_channel src[2];
   3131 
   3132          fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
   3133          fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
   3134          op(&dst.xyzw[chan], &src[0], &src[1]);
   3135       }
   3136    }
   3137    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   3138       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   3139          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
   3140       }
   3141    }
   3142 }
   3143 
   3144 typedef void (* micro_trinary_op)(union tgsi_exec_channel *dst,
   3145                                   const union tgsi_exec_channel *src0,
   3146                                   const union tgsi_exec_channel *src1,
   3147                                   const union tgsi_exec_channel *src2);
   3148 
   3149 static void
   3150 exec_vector_trinary(struct tgsi_exec_machine *mach,
   3151                     const struct tgsi_full_instruction *inst,
   3152                     micro_trinary_op op,
   3153                     enum tgsi_exec_datatype dst_datatype,
   3154                     enum tgsi_exec_datatype src_datatype)
   3155 {
   3156    unsigned int chan;
   3157    struct tgsi_exec_vector dst;
   3158 
   3159    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   3160       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   3161          union tgsi_exec_channel src[3];
   3162 
   3163          fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
   3164          fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
   3165          fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
   3166          op(&dst.xyzw[chan], &src[0], &src[1], &src[2]);
   3167       }
   3168    }
   3169    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   3170       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   3171          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
   3172       }
   3173    }
   3174 }
   3175 
   3176 typedef void (* micro_quaternary_op)(union tgsi_exec_channel *dst,
   3177                                      const union tgsi_exec_channel *src0,
   3178                                      const union tgsi_exec_channel *src1,
   3179                                      const union tgsi_exec_channel *src2,
   3180                                      const union tgsi_exec_channel *src3);
   3181 
   3182 static void
   3183 exec_vector_quaternary(struct tgsi_exec_machine *mach,
   3184                        const struct tgsi_full_instruction *inst,
   3185                        micro_quaternary_op op,
   3186                        enum tgsi_exec_datatype dst_datatype,
   3187                        enum tgsi_exec_datatype src_datatype)
   3188 {
   3189    unsigned int chan;
   3190    struct tgsi_exec_vector dst;
   3191 
   3192    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   3193       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   3194          union tgsi_exec_channel src[4];
   3195 
   3196          fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
   3197          fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
   3198          fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
   3199          fetch_source(mach, &src[3], &inst->Src[3], chan, src_datatype);
   3200          op(&dst.xyzw[chan], &src[0], &src[1], &src[2], &src[3]);
   3201       }
   3202    }
   3203    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   3204       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   3205          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
   3206       }
   3207    }
   3208 }
   3209 
   3210 static void
   3211 exec_dp3(struct tgsi_exec_machine *mach,
   3212          const struct tgsi_full_instruction *inst)
   3213 {
   3214    unsigned int chan;
   3215    union tgsi_exec_channel arg[3];
   3216 
   3217    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   3218    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   3219    micro_mul(&arg[2], &arg[0], &arg[1]);
   3220 
   3221    for (chan = TGSI_CHAN_Y; chan <= TGSI_CHAN_Z; chan++) {
   3222       fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
   3223       fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
   3224       micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
   3225    }
   3226 
   3227    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   3228       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   3229          store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
   3230       }
   3231    }
   3232 }
   3233 
   3234 static void
   3235 exec_dp4(struct tgsi_exec_machine *mach,
   3236          const struct tgsi_full_instruction *inst)
   3237 {
   3238    unsigned int chan;
   3239    union tgsi_exec_channel arg[3];
   3240 
   3241    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   3242    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   3243    micro_mul(&arg[2], &arg[0], &arg[1]);
   3244 
   3245    for (chan = TGSI_CHAN_Y; chan <= TGSI_CHAN_W; chan++) {
   3246       fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
   3247       fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
   3248       micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
   3249    }
   3250 
   3251    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   3252       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   3253          store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
   3254       }
   3255    }
   3256 }
   3257 
   3258 static void
   3259 exec_dp2a(struct tgsi_exec_machine *mach,
   3260           const struct tgsi_full_instruction *inst)
   3261 {
   3262    unsigned int chan;
   3263    union tgsi_exec_channel arg[3];
   3264 
   3265    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   3266    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   3267    micro_mul(&arg[2], &arg[0], &arg[1]);
   3268 
   3269    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   3270    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   3271    micro_mad(&arg[0], &arg[0], &arg[1], &arg[2]);
   3272 
   3273    fetch_source(mach, &arg[1], &inst->Src[2], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   3274    micro_add(&arg[0], &arg[0], &arg[1]);
   3275 
   3276    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   3277       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   3278          store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
   3279       }
   3280    }
   3281 }
   3282 
   3283 static void
   3284 exec_dph(struct tgsi_exec_machine *mach,
   3285          const struct tgsi_full_instruction *inst)
   3286 {
   3287    unsigned int chan;
   3288    union tgsi_exec_channel arg[3];
   3289 
   3290    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   3291    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   3292    micro_mul(&arg[2], &arg[0], &arg[1]);
   3293 
   3294    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   3295    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   3296    micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
   3297 
   3298    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
   3299    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
   3300    micro_mad(&arg[0], &arg[0], &arg[1], &arg[2]);
   3301 
   3302    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
   3303    micro_add(&arg[0], &arg[0], &arg[1]);
   3304 
   3305    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   3306       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   3307          store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
   3308       }
   3309    }
   3310 }
   3311 
   3312 static void
   3313 exec_dp2(struct tgsi_exec_machine *mach,
   3314          const struct tgsi_full_instruction *inst)
   3315 {
   3316    unsigned int chan;
   3317    union tgsi_exec_channel arg[3];
   3318 
   3319    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   3320    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   3321    micro_mul(&arg[2], &arg[0], &arg[1]);
   3322 
   3323    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   3324    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   3325    micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
   3326 
   3327    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   3328       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   3329          store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
   3330       }
   3331    }
   3332 }
   3333 
   3334 static void
   3335 exec_pk2h(struct tgsi_exec_machine *mach,
   3336           const struct tgsi_full_instruction *inst)
   3337 {
   3338    unsigned chan;
   3339    union tgsi_exec_channel arg[2], dst;
   3340 
   3341    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   3342    fetch_source(mach, &arg[1], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   3343    for (chan = 0; chan < TGSI_QUAD_SIZE; chan++) {
   3344       dst.u[chan] = util_float_to_half(arg[0].f[chan]) |
   3345          (util_float_to_half(arg[1].f[chan]) << 16);
   3346    }
   3347    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   3348       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   3349          store_dest(mach, &dst, &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_UINT);
   3350       }
   3351    }
   3352 }
   3353 
   3354 static void
   3355 exec_up2h(struct tgsi_exec_machine *mach,
   3356           const struct tgsi_full_instruction *inst)
   3357 {
   3358    unsigned chan;
   3359    union tgsi_exec_channel arg, dst[2];
   3360 
   3361    fetch_source(mach, &arg, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_UINT);
   3362    for (chan = 0; chan < TGSI_QUAD_SIZE; chan++) {
   3363       dst[0].f[chan] = util_half_to_float(arg.u[chan] & 0xffff);
   3364       dst[1].f[chan] = util_half_to_float(arg.u[chan] >> 16);
   3365    }
   3366    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   3367       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   3368          store_dest(mach, &dst[chan & 1], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
   3369       }
   3370    }
   3371 }
   3372 
   3373 static void
   3374 exec_scs(struct tgsi_exec_machine *mach,
   3375          const struct tgsi_full_instruction *inst)
   3376 {
   3377    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) {
   3378       union tgsi_exec_channel arg;
   3379       union tgsi_exec_channel result;
   3380 
   3381       fetch_source(mach, &arg, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   3382 
   3383       if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
   3384          micro_cos(&result, &arg);
   3385          store_dest(mach, &result, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   3386       }
   3387       if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
   3388          micro_sin(&result, &arg);
   3389          store_dest(mach, &result, &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   3390       }
   3391    }
   3392    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
   3393       store_dest(mach, &ZeroVec, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
   3394    }
   3395    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
   3396       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
   3397    }
   3398 }
   3399 
   3400 static void
   3401 exec_xpd(struct tgsi_exec_machine *mach,
   3402          const struct tgsi_full_instruction *inst)
   3403 {
   3404    union tgsi_exec_channel r[6];
   3405    union tgsi_exec_channel d[3];
   3406 
   3407    fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   3408    fetch_source(mach, &r[1], &inst->Src[1], TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
   3409 
   3410    micro_mul(&r[2], &r[0], &r[1]);
   3411 
   3412    fetch_source(mach, &r[3], &inst->Src[0], TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
   3413    fetch_source(mach, &r[4], &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   3414 
   3415    micro_mul(&r[5], &r[3], &r[4] );
   3416    micro_sub(&d[TGSI_CHAN_X], &r[2], &r[5]);
   3417 
   3418    fetch_source(mach, &r[2], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   3419 
   3420    micro_mul(&r[3], &r[3], &r[2]);
   3421 
   3422    fetch_source(mach, &r[5], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   3423 
   3424    micro_mul(&r[1], &r[1], &r[5]);
   3425    micro_sub(&d[TGSI_CHAN_Y], &r[3], &r[1]);
   3426 
   3427    micro_mul(&r[5], &r[5], &r[4]);
   3428    micro_mul(&r[0], &r[0], &r[2]);
   3429    micro_sub(&d[TGSI_CHAN_Z], &r[5], &r[0]);
   3430 
   3431    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
   3432       store_dest(mach, &d[TGSI_CHAN_X], &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   3433    }
   3434    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
   3435       store_dest(mach, &d[TGSI_CHAN_Y], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   3436    }
   3437    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
   3438       store_dest(mach, &d[TGSI_CHAN_Z], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
   3439    }
   3440    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
   3441       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
   3442    }
   3443 }
   3444 
   3445 static void
   3446 exec_dst(struct tgsi_exec_machine *mach,
   3447          const struct tgsi_full_instruction *inst)
   3448 {
   3449    union tgsi_exec_channel r[2];
   3450    union tgsi_exec_channel d[4];
   3451 
   3452    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
   3453       fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   3454       fetch_source(mach, &r[1], &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   3455       micro_mul(&d[TGSI_CHAN_Y], &r[0], &r[1]);
   3456    }
   3457    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
   3458       fetch_source(mach, &d[TGSI_CHAN_Z], &inst->Src[0], TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
   3459    }
   3460    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
   3461       fetch_source(mach, &d[TGSI_CHAN_W], &inst->Src[1], TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
   3462    }
   3463 
   3464    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
   3465       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   3466    }
   3467    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
   3468       store_dest(mach, &d[TGSI_CHAN_Y], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   3469    }
   3470    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
   3471       store_dest(mach, &d[TGSI_CHAN_Z], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
   3472    }
   3473    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
   3474       store_dest(mach, &d[TGSI_CHAN_W], &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
   3475    }
   3476 }
   3477 
   3478 static void
   3479 exec_log(struct tgsi_exec_machine *mach,
   3480          const struct tgsi_full_instruction *inst)
   3481 {
   3482    union tgsi_exec_channel r[3];
   3483 
   3484    fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   3485    micro_abs(&r[2], &r[0]);  /* r2 = abs(r0) */
   3486    micro_lg2(&r[1], &r[2]);  /* r1 = lg2(r2) */
   3487    micro_flr(&r[0], &r[1]);  /* r0 = floor(r1) */
   3488    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
   3489       store_dest(mach, &r[0], &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   3490    }
   3491    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
   3492       micro_exp2(&r[0], &r[0]);       /* r0 = 2 ^ r0 */
   3493       micro_div(&r[0], &r[2], &r[0]); /* r0 = r2 / r0 */
   3494       store_dest(mach, &r[0], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   3495    }
   3496    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
   3497       store_dest(mach, &r[1], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
   3498    }
   3499    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
   3500       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
   3501    }
   3502 }
   3503 
   3504 static void
   3505 exec_exp(struct tgsi_exec_machine *mach,
   3506          const struct tgsi_full_instruction *inst)
   3507 {
   3508    union tgsi_exec_channel r[3];
   3509 
   3510    fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   3511    micro_flr(&r[1], &r[0]);  /* r1 = floor(r0) */
   3512    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
   3513       micro_exp2(&r[2], &r[1]);       /* r2 = 2 ^ r1 */
   3514       store_dest(mach, &r[2], &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   3515    }
   3516    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
   3517       micro_sub(&r[2], &r[0], &r[1]); /* r2 = r0 - r1 */
   3518       store_dest(mach, &r[2], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   3519    }
   3520    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
   3521       micro_exp2(&r[2], &r[0]);       /* r2 = 2 ^ r0 */
   3522       store_dest(mach, &r[2], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
   3523    }
   3524    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
   3525       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
   3526    }
   3527 }
   3528 
   3529 static void
   3530 exec_lit(struct tgsi_exec_machine *mach,
   3531          const struct tgsi_full_instruction *inst)
   3532 {
   3533    union tgsi_exec_channel r[3];
   3534    union tgsi_exec_channel d[3];
   3535 
   3536    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_YZ) {
   3537       fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   3538       if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
   3539          fetch_source(mach, &r[1], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   3540          micro_max(&r[1], &r[1], &ZeroVec);
   3541 
   3542          fetch_source(mach, &r[2], &inst->Src[0], TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
   3543          micro_min(&r[2], &r[2], &P128Vec);
   3544          micro_max(&r[2], &r[2], &M128Vec);
   3545          micro_pow(&r[1], &r[1], &r[2]);
   3546          micro_lt(&d[TGSI_CHAN_Z], &ZeroVec, &r[0], &r[1], &ZeroVec);
   3547          store_dest(mach, &d[TGSI_CHAN_Z], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
   3548       }
   3549       if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
   3550          micro_max(&d[TGSI_CHAN_Y], &r[0], &ZeroVec);
   3551          store_dest(mach, &d[TGSI_CHAN_Y], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   3552       }
   3553    }
   3554    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
   3555       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   3556    }
   3557 
   3558    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
   3559       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
   3560    }
   3561 }
   3562 
   3563 static void
   3564 exec_break(struct tgsi_exec_machine *mach)
   3565 {
   3566    if (mach->BreakType == TGSI_EXEC_BREAK_INSIDE_LOOP) {
   3567       /* turn off loop channels for each enabled exec channel */
   3568       mach->LoopMask &= ~mach->ExecMask;
   3569       /* Todo: if mach->LoopMask == 0, jump to end of loop */
   3570       UPDATE_EXEC_MASK(mach);
   3571    } else {
   3572       assert(mach->BreakType == TGSI_EXEC_BREAK_INSIDE_SWITCH);
   3573 
   3574       mach->Switch.mask = 0x0;
   3575 
   3576       UPDATE_EXEC_MASK(mach);
   3577    }
   3578 }
   3579 
   3580 static void
   3581 exec_switch(struct tgsi_exec_machine *mach,
   3582             const struct tgsi_full_instruction *inst)
   3583 {
   3584    assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
   3585    assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
   3586 
   3587    mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
   3588    fetch_source(mach, &mach->Switch.selector, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_UINT);
   3589    mach->Switch.mask = 0x0;
   3590    mach->Switch.defaultMask = 0x0;
   3591 
   3592    mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
   3593    mach->BreakType = TGSI_EXEC_BREAK_INSIDE_SWITCH;
   3594 
   3595    UPDATE_EXEC_MASK(mach);
   3596 }
   3597 
   3598 static void
   3599 exec_case(struct tgsi_exec_machine *mach,
   3600           const struct tgsi_full_instruction *inst)
   3601 {
   3602    uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
   3603    union tgsi_exec_channel src;
   3604    uint mask = 0;
   3605 
   3606    fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_UINT);
   3607 
   3608    if (mach->Switch.selector.u[0] == src.u[0]) {
   3609       mask |= 0x1;
   3610    }
   3611    if (mach->Switch.selector.u[1] == src.u[1]) {
   3612       mask |= 0x2;
   3613    }
   3614    if (mach->Switch.selector.u[2] == src.u[2]) {
   3615       mask |= 0x4;
   3616    }
   3617    if (mach->Switch.selector.u[3] == src.u[3]) {
   3618       mask |= 0x8;
   3619    }
   3620 
   3621    mach->Switch.defaultMask |= mask;
   3622 
   3623    mach->Switch.mask |= mask & prevMask;
   3624 
   3625    UPDATE_EXEC_MASK(mach);
   3626 }
   3627 
   3628 /* FIXME: this will only work if default is last */
   3629 static void
   3630 exec_default(struct tgsi_exec_machine *mach)
   3631 {
   3632    uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
   3633 
   3634    mach->Switch.mask |= ~mach->Switch.defaultMask & prevMask;
   3635 
   3636    UPDATE_EXEC_MASK(mach);
   3637 }
   3638 
   3639 static void
   3640 exec_endswitch(struct tgsi_exec_machine *mach)
   3641 {
   3642    mach->Switch = mach->SwitchStack[--mach->SwitchStackTop];
   3643    mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
   3644 
   3645    UPDATE_EXEC_MASK(mach);
   3646 }
   3647 
   3648 typedef void (* micro_dop)(union tgsi_double_channel *dst,
   3649                            const union tgsi_double_channel *src);
   3650 
   3651 typedef void (* micro_dop_sop)(union tgsi_double_channel *dst,
   3652                                const union tgsi_double_channel *src0,
   3653                                union tgsi_exec_channel *src1);
   3654 
   3655 typedef void (* micro_dop_s)(union tgsi_double_channel *dst,
   3656                              const union tgsi_exec_channel *src);
   3657 
   3658 typedef void (* micro_sop_d)(union tgsi_exec_channel *dst,
   3659                              const union tgsi_double_channel *src);
   3660 
   3661 static void
   3662 fetch_double_channel(struct tgsi_exec_machine *mach,
   3663                      union tgsi_double_channel *chan,
   3664                      const struct tgsi_full_src_register *reg,
   3665                      uint chan_0,
   3666                      uint chan_1)
   3667 {
   3668    union tgsi_exec_channel src[2];
   3669    uint i;
   3670 
   3671    fetch_source_d(mach, &src[0], reg, chan_0, TGSI_EXEC_DATA_UINT);
   3672    fetch_source_d(mach, &src[1], reg, chan_1, TGSI_EXEC_DATA_UINT);
   3673 
   3674    for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   3675       chan->u[i][0] = src[0].u[i];
   3676       chan->u[i][1] = src[1].u[i];
   3677    }
   3678    if (reg->Register.Absolute) {
   3679       micro_dabs(chan, chan);
   3680    }
   3681    if (reg->Register.Negate) {
   3682       micro_dneg(chan, chan);
   3683    }
   3684 }
   3685 
   3686 static void
   3687 store_double_channel(struct tgsi_exec_machine *mach,
   3688                      const union tgsi_double_channel *chan,
   3689                      const struct tgsi_full_dst_register *reg,
   3690                      const struct tgsi_full_instruction *inst,
   3691                      uint chan_0,
   3692                      uint chan_1)
   3693 {
   3694    union tgsi_exec_channel dst[2];
   3695    uint i;
   3696    union tgsi_double_channel temp;
   3697    const uint execmask = mach->ExecMask;
   3698 
   3699    if (!inst->Instruction.Saturate) {
   3700       for (i = 0; i < TGSI_QUAD_SIZE; i++)
   3701          if (execmask & (1 << i)) {
   3702             dst[0].u[i] = chan->u[i][0];
   3703             dst[1].u[i] = chan->u[i][1];
   3704          }
   3705    }
   3706    else {
   3707       for (i = 0; i < TGSI_QUAD_SIZE; i++)
   3708          if (execmask & (1 << i)) {
   3709             if (chan->d[i] < 0.0)
   3710                temp.d[i] = 0.0;
   3711             else if (chan->d[i] > 1.0)
   3712                temp.d[i] = 1.0;
   3713             else
   3714                temp.d[i] = chan->d[i];
   3715 
   3716             dst[0].u[i] = temp.u[i][0];
   3717             dst[1].u[i] = temp.u[i][1];
   3718          }
   3719    }
   3720 
   3721    store_dest_double(mach, &dst[0], reg, inst, chan_0, TGSI_EXEC_DATA_UINT);
   3722    if (chan_1 != -1)
   3723       store_dest_double(mach, &dst[1], reg, inst, chan_1, TGSI_EXEC_DATA_UINT);
   3724 }
   3725 
   3726 static void
   3727 exec_double_unary(struct tgsi_exec_machine *mach,
   3728                   const struct tgsi_full_instruction *inst,
   3729                   micro_dop op)
   3730 {
   3731    union tgsi_double_channel src;
   3732    union tgsi_double_channel dst;
   3733 
   3734    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) == TGSI_WRITEMASK_XY) {
   3735       fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
   3736       op(&dst, &src);
   3737       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
   3738    }
   3739    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_ZW) == TGSI_WRITEMASK_ZW) {
   3740       fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
   3741       op(&dst, &src);
   3742       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
   3743    }
   3744 }
   3745 
   3746 static void
   3747 exec_double_binary(struct tgsi_exec_machine *mach,
   3748                    const struct tgsi_full_instruction *inst,
   3749                    micro_dop op,
   3750                    enum tgsi_exec_datatype dst_datatype)
   3751 {
   3752    union tgsi_double_channel src[2];
   3753    union tgsi_double_channel dst;
   3754    int first_dest_chan, second_dest_chan;
   3755    int wmask;
   3756 
   3757    wmask = inst->Dst[0].Register.WriteMask;
   3758    /* these are & because of the way DSLT etc store their destinations */
   3759    if (wmask & TGSI_WRITEMASK_XY) {
   3760       first_dest_chan = TGSI_CHAN_X;
   3761       second_dest_chan = TGSI_CHAN_Y;
   3762       if (dst_datatype == TGSI_EXEC_DATA_UINT) {
   3763          first_dest_chan = (wmask & TGSI_WRITEMASK_X) ? TGSI_CHAN_X : TGSI_CHAN_Y;
   3764          second_dest_chan = -1;
   3765       }
   3766 
   3767       fetch_double_channel(mach, &src[0], &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
   3768       fetch_double_channel(mach, &src[1], &inst->Src[1], TGSI_CHAN_X, TGSI_CHAN_Y);
   3769       op(&dst, src);
   3770       store_double_channel(mach, &dst, &inst->Dst[0], inst, first_dest_chan, second_dest_chan);
   3771    }
   3772 
   3773    if (wmask & TGSI_WRITEMASK_ZW) {
   3774       first_dest_chan = TGSI_CHAN_Z;
   3775       second_dest_chan = TGSI_CHAN_W;
   3776       if (dst_datatype == TGSI_EXEC_DATA_UINT) {
   3777          first_dest_chan = (wmask & TGSI_WRITEMASK_Z) ? TGSI_CHAN_Z : TGSI_CHAN_W;
   3778          second_dest_chan = -1;
   3779       }
   3780 
   3781       fetch_double_channel(mach, &src[0], &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
   3782       fetch_double_channel(mach, &src[1], &inst->Src[1], TGSI_CHAN_Z, TGSI_CHAN_W);
   3783       op(&dst, src);
   3784       store_double_channel(mach, &dst, &inst->Dst[0], inst, first_dest_chan, second_dest_chan);
   3785    }
   3786 }
   3787 
   3788 static void
   3789 exec_double_trinary(struct tgsi_exec_machine *mach,
   3790                     const struct tgsi_full_instruction *inst,
   3791                     micro_dop op)
   3792 {
   3793    union tgsi_double_channel src[3];
   3794    union tgsi_double_channel dst;
   3795 
   3796    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) == TGSI_WRITEMASK_XY) {
   3797       fetch_double_channel(mach, &src[0], &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
   3798       fetch_double_channel(mach, &src[1], &inst->Src[1], TGSI_CHAN_X, TGSI_CHAN_Y);
   3799       fetch_double_channel(mach, &src[2], &inst->Src[2], TGSI_CHAN_X, TGSI_CHAN_Y);
   3800       op(&dst, src);
   3801       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
   3802    }
   3803    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_ZW) == TGSI_WRITEMASK_ZW) {
   3804       fetch_double_channel(mach, &src[0], &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
   3805       fetch_double_channel(mach, &src[1], &inst->Src[1], TGSI_CHAN_Z, TGSI_CHAN_W);
   3806       fetch_double_channel(mach, &src[2], &inst->Src[2], TGSI_CHAN_Z, TGSI_CHAN_W);
   3807       op(&dst, src);
   3808       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
   3809    }
   3810 }
   3811 
   3812 static void
   3813 exec_dldexp(struct tgsi_exec_machine *mach,
   3814             const struct tgsi_full_instruction *inst)
   3815 {
   3816    union tgsi_double_channel src0;
   3817    union tgsi_exec_channel src1;
   3818    union tgsi_double_channel dst;
   3819    int wmask;
   3820 
   3821    wmask = inst->Dst[0].Register.WriteMask;
   3822    if (wmask & TGSI_WRITEMASK_XY) {
   3823       fetch_double_channel(mach, &src0, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
   3824       fetch_source(mach, &src1, &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_INT);
   3825       micro_dldexp(&dst, &src0, &src1);
   3826       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
   3827    }
   3828 
   3829    if (wmask & TGSI_WRITEMASK_ZW) {
   3830       fetch_double_channel(mach, &src0, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
   3831       fetch_source(mach, &src1, &inst->Src[1], TGSI_CHAN_Z, TGSI_EXEC_DATA_INT);
   3832       micro_dldexp(&dst, &src0, &src1);
   3833       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
   3834    }
   3835 }
   3836 
   3837 static void
   3838 exec_dfracexp(struct tgsi_exec_machine *mach,
   3839               const struct tgsi_full_instruction *inst)
   3840 {
   3841    union tgsi_double_channel src;
   3842    union tgsi_double_channel dst;
   3843    union tgsi_exec_channel dst_exp;
   3844 
   3845    if (((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) == TGSI_WRITEMASK_XY)) {
   3846       fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
   3847       micro_dfracexp(&dst, &dst_exp, &src);
   3848       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
   3849       store_dest(mach, &dst_exp, &inst->Dst[1], inst, ffs(inst->Dst[1].Register.WriteMask) - 1, TGSI_EXEC_DATA_INT);
   3850    }
   3851    if (((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_ZW) == TGSI_WRITEMASK_ZW)) {
   3852       fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
   3853       micro_dfracexp(&dst, &dst_exp, &src);
   3854       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
   3855       store_dest(mach, &dst_exp, &inst->Dst[1], inst, ffs(inst->Dst[1].Register.WriteMask) - 1, TGSI_EXEC_DATA_INT);
   3856    }
   3857 }
   3858 
   3859 static void
   3860 exec_arg0_64_arg1_32(struct tgsi_exec_machine *mach,
   3861             const struct tgsi_full_instruction *inst,
   3862             micro_dop_sop op)
   3863 {
   3864    union tgsi_double_channel src0;
   3865    union tgsi_exec_channel src1;
   3866    union tgsi_double_channel dst;
   3867    int wmask;
   3868 
   3869    wmask = inst->Dst[0].Register.WriteMask;
   3870    if (wmask & TGSI_WRITEMASK_XY) {
   3871       fetch_double_channel(mach, &src0, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
   3872       fetch_source(mach, &src1, &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_INT);
   3873       op(&dst, &src0, &src1);
   3874       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
   3875    }
   3876 
   3877    if (wmask & TGSI_WRITEMASK_ZW) {
   3878       fetch_double_channel(mach, &src0, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
   3879       fetch_source(mach, &src1, &inst->Src[1], TGSI_CHAN_Z, TGSI_EXEC_DATA_INT);
   3880       op(&dst, &src0, &src1);
   3881       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
   3882    }
   3883 }
   3884 
   3885 static int
   3886 get_image_coord_dim(unsigned tgsi_tex)
   3887 {
   3888    int dim;
   3889    switch (tgsi_tex) {
   3890    case TGSI_TEXTURE_BUFFER:
   3891    case TGSI_TEXTURE_1D:
   3892       dim = 1;
   3893       break;
   3894    case TGSI_TEXTURE_2D:
   3895    case TGSI_TEXTURE_RECT:
   3896    case TGSI_TEXTURE_1D_ARRAY:
   3897    case TGSI_TEXTURE_2D_MSAA:
   3898       dim = 2;
   3899       break;
   3900    case TGSI_TEXTURE_3D:
   3901    case TGSI_TEXTURE_CUBE:
   3902    case TGSI_TEXTURE_2D_ARRAY:
   3903    case TGSI_TEXTURE_2D_ARRAY_MSAA:
   3904    case TGSI_TEXTURE_CUBE_ARRAY:
   3905       dim = 3;
   3906       break;
   3907    default:
   3908       assert(!"unknown texture target");
   3909       dim = 0;
   3910       break;
   3911    }
   3912 
   3913    return dim;
   3914 }
   3915 
   3916 static int
   3917 get_image_coord_sample(unsigned tgsi_tex)
   3918 {
   3919    int sample = 0;
   3920    switch (tgsi_tex) {
   3921    case TGSI_TEXTURE_2D_MSAA:
   3922       sample = 3;
   3923       break;
   3924    case TGSI_TEXTURE_2D_ARRAY_MSAA:
   3925       sample = 4;
   3926       break;
   3927    default:
   3928       break;
   3929    }
   3930    return sample;
   3931 }
   3932 
   3933 static void
   3934 exec_load_img(struct tgsi_exec_machine *mach,
   3935               const struct tgsi_full_instruction *inst)
   3936 {
   3937    union tgsi_exec_channel r[4], sample_r;
   3938    uint unit;
   3939    int sample;
   3940    int i, j;
   3941    int dim;
   3942    uint chan;
   3943    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
   3944    struct tgsi_image_params params;
   3945    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
   3946 
   3947    unit = fetch_sampler_unit(mach, inst, 0);
   3948    dim = get_image_coord_dim(inst->Memory.Texture);
   3949    sample = get_image_coord_sample(inst->Memory.Texture);
   3950    assert(dim <= 3);
   3951 
   3952    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
   3953    params.unit = unit;
   3954    params.tgsi_tex_instr = inst->Memory.Texture;
   3955    params.format = inst->Memory.Format;
   3956 
   3957    for (i = 0; i < dim; i++) {
   3958       IFETCH(&r[i], 1, TGSI_CHAN_X + i);
   3959    }
   3960 
   3961    if (sample)
   3962       IFETCH(&sample_r, 1, TGSI_CHAN_X + sample);
   3963 
   3964    mach->Image->load(mach->Image, &params,
   3965                      r[0].i, r[1].i, r[2].i, sample_r.i,
   3966                      rgba);
   3967    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
   3968       r[0].f[j] = rgba[0][j];
   3969       r[1].f[j] = rgba[1][j];
   3970       r[2].f[j] = rgba[2][j];
   3971       r[3].f[j] = rgba[3][j];
   3972    }
   3973    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   3974       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   3975          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
   3976       }
   3977    }
   3978 }
   3979 
   3980 static void
   3981 exec_load_buf(struct tgsi_exec_machine *mach,
   3982               const struct tgsi_full_instruction *inst)
   3983 {
   3984    union tgsi_exec_channel r[4];
   3985    uint unit;
   3986    int j;
   3987    uint chan;
   3988    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
   3989    struct tgsi_buffer_params params;
   3990    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
   3991 
   3992    unit = fetch_sampler_unit(mach, inst, 0);
   3993 
   3994    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
   3995    params.unit = unit;
   3996    IFETCH(&r[0], 1, TGSI_CHAN_X);
   3997 
   3998    mach->Buffer->load(mach->Buffer, &params,
   3999                       r[0].i, rgba);
   4000    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
   4001       r[0].f[j] = rgba[0][j];
   4002       r[1].f[j] = rgba[1][j];
   4003       r[2].f[j] = rgba[2][j];
   4004       r[3].f[j] = rgba[3][j];
   4005    }
   4006    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   4007       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   4008          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
   4009       }
   4010    }
   4011 }
   4012 
   4013 static void
   4014 exec_load_mem(struct tgsi_exec_machine *mach,
   4015               const struct tgsi_full_instruction *inst)
   4016 {
   4017    union tgsi_exec_channel r[4];
   4018    uint chan;
   4019    char *ptr = mach->LocalMem;
   4020    uint32_t offset;
   4021    int j;
   4022 
   4023    IFETCH(&r[0], 1, TGSI_CHAN_X);
   4024    if (r[0].u[0] >= mach->LocalMemSize)
   4025       return;
   4026 
   4027    offset = r[0].u[0];
   4028    ptr += offset;
   4029 
   4030    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
   4031       for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   4032          if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   4033             memcpy(&r[chan].u[j], ptr + (4 * chan), 4);
   4034          }
   4035       }
   4036    }
   4037 
   4038    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   4039       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   4040          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
   4041       }
   4042    }
   4043 }
   4044 
   4045 static void
   4046 exec_load(struct tgsi_exec_machine *mach,
   4047           const struct tgsi_full_instruction *inst)
   4048 {
   4049    if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
   4050       exec_load_img(mach, inst);
   4051    else if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
   4052       exec_load_buf(mach, inst);
   4053    else if (inst->Src[0].Register.File == TGSI_FILE_MEMORY)
   4054       exec_load_mem(mach, inst);
   4055 }
   4056 
   4057 static void
   4058 exec_store_img(struct tgsi_exec_machine *mach,
   4059                const struct tgsi_full_instruction *inst)
   4060 {
   4061    union tgsi_exec_channel r[3], sample_r;
   4062    union tgsi_exec_channel value[4];
   4063    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
   4064    struct tgsi_image_params params;
   4065    int dim;
   4066    int sample;
   4067    int i, j;
   4068    uint unit;
   4069    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
   4070    unit = inst->Dst[0].Register.Index;
   4071    dim = get_image_coord_dim(inst->Memory.Texture);
   4072    sample = get_image_coord_sample(inst->Memory.Texture);
   4073    assert(dim <= 3);
   4074 
   4075    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
   4076    params.unit = unit;
   4077    params.tgsi_tex_instr = inst->Memory.Texture;
   4078    params.format = inst->Memory.Format;
   4079 
   4080    for (i = 0; i < dim; i++) {
   4081       IFETCH(&r[i], 0, TGSI_CHAN_X + i);
   4082    }
   4083 
   4084    for (i = 0; i < 4; i++) {
   4085       FETCH(&value[i], 1, TGSI_CHAN_X + i);
   4086    }
   4087    if (sample)
   4088       IFETCH(&sample_r, 0, TGSI_CHAN_X + sample);
   4089 
   4090    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
   4091       rgba[0][j] = value[0].f[j];
   4092       rgba[1][j] = value[1].f[j];
   4093       rgba[2][j] = value[2].f[j];
   4094       rgba[3][j] = value[3].f[j];
   4095    }
   4096 
   4097    mach->Image->store(mach->Image, &params,
   4098                       r[0].i, r[1].i, r[2].i, sample_r.i,
   4099                       rgba);
   4100 }
   4101 
   4102 static void
   4103 exec_store_buf(struct tgsi_exec_machine *mach,
   4104                const struct tgsi_full_instruction *inst)
   4105 {
   4106    union tgsi_exec_channel r[3];
   4107    union tgsi_exec_channel value[4];
   4108    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
   4109    struct tgsi_buffer_params params;
   4110    int i, j;
   4111    uint unit;
   4112    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
   4113 
   4114    unit = inst->Dst[0].Register.Index;
   4115 
   4116    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
   4117    params.unit = unit;
   4118    params.writemask = inst->Dst[0].Register.WriteMask;
   4119 
   4120    IFETCH(&r[0], 0, TGSI_CHAN_X);
   4121    for (i = 0; i < 4; i++) {
   4122       FETCH(&value[i], 1, TGSI_CHAN_X + i);
   4123    }
   4124 
   4125    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
   4126       rgba[0][j] = value[0].f[j];
   4127       rgba[1][j] = value[1].f[j];
   4128       rgba[2][j] = value[2].f[j];
   4129       rgba[3][j] = value[3].f[j];
   4130    }
   4131 
   4132    mach->Buffer->store(mach->Buffer, &params,
   4133                       r[0].i,
   4134                       rgba);
   4135 }
   4136 
   4137 static void
   4138 exec_store_mem(struct tgsi_exec_machine *mach,
   4139                const struct tgsi_full_instruction *inst)
   4140 {
   4141    union tgsi_exec_channel r[3];
   4142    union tgsi_exec_channel value[4];
   4143    uint i, chan;
   4144    char *ptr = mach->LocalMem;
   4145    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
   4146    int execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
   4147 
   4148    IFETCH(&r[0], 0, TGSI_CHAN_X);
   4149 
   4150    for (i = 0; i < 4; i++) {
   4151       FETCH(&value[i], 1, TGSI_CHAN_X + i);
   4152    }
   4153 
   4154    if (r[0].u[0] >= mach->LocalMemSize)
   4155       return;
   4156    ptr += r[0].u[0];
   4157 
   4158    for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   4159       if (execmask & (1 << i)) {
   4160          for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   4161             if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   4162                memcpy(ptr + (chan * 4), &value[chan].u[0], 4);
   4163             }
   4164          }
   4165       }
   4166    }
   4167 }
   4168 
   4169 static void
   4170 exec_store(struct tgsi_exec_machine *mach,
   4171            const struct tgsi_full_instruction *inst)
   4172 {
   4173    if (inst->Dst[0].Register.File == TGSI_FILE_IMAGE)
   4174       exec_store_img(mach, inst);
   4175    else if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER)
   4176       exec_store_buf(mach, inst);
   4177    else if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY)
   4178       exec_store_mem(mach, inst);
   4179 }
   4180 
   4181 static void
   4182 exec_atomop_img(struct tgsi_exec_machine *mach,
   4183                 const struct tgsi_full_instruction *inst)
   4184 {
   4185    union tgsi_exec_channel r[4], sample_r;
   4186    union tgsi_exec_channel value[4], value2[4];
   4187    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
   4188    float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
   4189    struct tgsi_image_params params;
   4190    int dim;
   4191    int sample;
   4192    int i, j;
   4193    uint unit, chan;
   4194    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
   4195    unit = fetch_sampler_unit(mach, inst, 0);
   4196    dim = get_image_coord_dim(inst->Memory.Texture);
   4197    sample = get_image_coord_sample(inst->Memory.Texture);
   4198    assert(dim <= 3);
   4199 
   4200    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
   4201    params.unit = unit;
   4202    params.tgsi_tex_instr = inst->Memory.Texture;
   4203    params.format = inst->Memory.Format;
   4204 
   4205    for (i = 0; i < dim; i++) {
   4206       IFETCH(&r[i], 1, TGSI_CHAN_X + i);
   4207    }
   4208 
   4209    for (i = 0; i < 4; i++) {
   4210       FETCH(&value[i], 2, TGSI_CHAN_X + i);
   4211       if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
   4212          FETCH(&value2[i], 3, TGSI_CHAN_X + i);
   4213    }
   4214    if (sample)
   4215       IFETCH(&sample_r, 1, TGSI_CHAN_X + sample);
   4216 
   4217    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
   4218       rgba[0][j] = value[0].f[j];
   4219       rgba[1][j] = value[1].f[j];
   4220       rgba[2][j] = value[2].f[j];
   4221       rgba[3][j] = value[3].f[j];
   4222    }
   4223    if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
   4224       for (j = 0; j < TGSI_QUAD_SIZE; j++) {
   4225          rgba2[0][j] = value2[0].f[j];
   4226          rgba2[1][j] = value2[1].f[j];
   4227          rgba2[2][j] = value2[2].f[j];
   4228          rgba2[3][j] = value2[3].f[j];
   4229       }
   4230    }
   4231 
   4232    mach->Image->op(mach->Image, &params, inst->Instruction.Opcode,
   4233                    r[0].i, r[1].i, r[2].i, sample_r.i,
   4234                    rgba, rgba2);
   4235 
   4236    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
   4237       r[0].f[j] = rgba[0][j];
   4238       r[1].f[j] = rgba[1][j];
   4239       r[2].f[j] = rgba[2][j];
   4240       r[3].f[j] = rgba[3][j];
   4241    }
   4242    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   4243       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   4244          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
   4245       }
   4246    }
   4247 }
   4248 
   4249 static void
   4250 exec_atomop_buf(struct tgsi_exec_machine *mach,
   4251                 const struct tgsi_full_instruction *inst)
   4252 {
   4253    union tgsi_exec_channel r[4];
   4254    union tgsi_exec_channel value[4], value2[4];
   4255    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
   4256    float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
   4257    struct tgsi_buffer_params params;
   4258    int i, j;
   4259    uint unit, chan;
   4260    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
   4261 
   4262    unit = fetch_sampler_unit(mach, inst, 0);
   4263 
   4264    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
   4265    params.unit = unit;
   4266    params.writemask = inst->Dst[0].Register.WriteMask;
   4267 
   4268    IFETCH(&r[0], 1, TGSI_CHAN_X);
   4269 
   4270    for (i = 0; i < 4; i++) {
   4271       FETCH(&value[i], 2, TGSI_CHAN_X + i);
   4272       if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
   4273          FETCH(&value2[i], 3, TGSI_CHAN_X + i);
   4274    }
   4275 
   4276    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
   4277       rgba[0][j] = value[0].f[j];
   4278       rgba[1][j] = value[1].f[j];
   4279       rgba[2][j] = value[2].f[j];
   4280       rgba[3][j] = value[3].f[j];
   4281    }
   4282    if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
   4283       for (j = 0; j < TGSI_QUAD_SIZE; j++) {
   4284          rgba2[0][j] = value2[0].f[j];
   4285          rgba2[1][j] = value2[1].f[j];
   4286          rgba2[2][j] = value2[2].f[j];
   4287          rgba2[3][j] = value2[3].f[j];
   4288       }
   4289    }
   4290 
   4291    mach->Buffer->op(mach->Buffer, &params, inst->Instruction.Opcode,
   4292                    r[0].i,
   4293                    rgba, rgba2);
   4294 
   4295    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
   4296       r[0].f[j] = rgba[0][j];
   4297       r[1].f[j] = rgba[1][j];
   4298       r[2].f[j] = rgba[2][j];
   4299       r[3].f[j] = rgba[3][j];
   4300    }
   4301    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   4302       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   4303          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
   4304       }
   4305    }
   4306 }
   4307 
   4308 static void
   4309 exec_atomop_mem(struct tgsi_exec_machine *mach,
   4310                 const struct tgsi_full_instruction *inst)
   4311 {
   4312    union tgsi_exec_channel r[4];
   4313    union tgsi_exec_channel value[4], value2[4];
   4314    char *ptr = mach->LocalMem;
   4315    uint32_t val;
   4316    uint chan, i;
   4317    uint32_t offset;
   4318    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
   4319    int execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
   4320    IFETCH(&r[0], 1, TGSI_CHAN_X);
   4321 
   4322    if (r[0].u[0] >= mach->LocalMemSize)
   4323       return;
   4324 
   4325    offset = r[0].u[0];
   4326    ptr += offset;
   4327    for (i = 0; i < 4; i++) {
   4328       FETCH(&value[i], 2, TGSI_CHAN_X + i);
   4329       if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
   4330          FETCH(&value2[i], 3, TGSI_CHAN_X + i);
   4331    }
   4332 
   4333    memcpy(&r[0].u[0], ptr, 4);
   4334    val = r[0].u[0];
   4335    switch (inst->Instruction.Opcode) {
   4336    case TGSI_OPCODE_ATOMUADD:
   4337       val += value[0].u[0];
   4338       break;
   4339    case TGSI_OPCODE_ATOMXOR:
   4340       val ^= value[0].u[0];
   4341       break;
   4342    case TGSI_OPCODE_ATOMOR:
   4343       val |= value[0].u[0];
   4344       break;
   4345    case TGSI_OPCODE_ATOMAND:
   4346       val &= value[0].u[0];
   4347       break;
   4348    case TGSI_OPCODE_ATOMUMIN:
   4349       val = MIN2(val, value[0].u[0]);
   4350       break;
   4351    case TGSI_OPCODE_ATOMUMAX:
   4352       val = MAX2(val, value[0].u[0]);
   4353       break;
   4354    case TGSI_OPCODE_ATOMIMIN:
   4355       val = MIN2(r[0].i[0], value[0].i[0]);
   4356       break;
   4357    case TGSI_OPCODE_ATOMIMAX:
   4358       val = MAX2(r[0].i[0], value[0].i[0]);
   4359       break;
   4360    case TGSI_OPCODE_ATOMXCHG:
   4361       val = value[0].i[0];
   4362       break;
   4363    case TGSI_OPCODE_ATOMCAS:
   4364       if (val == value[0].u[0])
   4365          val = value2[0].u[0];
   4366       break;
   4367    default:
   4368       break;
   4369    }
   4370    for (i = 0; i < TGSI_QUAD_SIZE; i++)
   4371       if (execmask & (1 << i))
   4372          memcpy(ptr, &val, 4);
   4373 
   4374    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   4375       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   4376          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
   4377       }
   4378    }
   4379 }
   4380 
   4381 static void
   4382 exec_atomop(struct tgsi_exec_machine *mach,
   4383             const struct tgsi_full_instruction *inst)
   4384 {
   4385    if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
   4386       exec_atomop_img(mach, inst);
   4387    else if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
   4388       exec_atomop_buf(mach, inst);
   4389    else if (inst->Src[0].Register.File == TGSI_FILE_MEMORY)
   4390       exec_atomop_mem(mach, inst);
   4391 }
   4392 
   4393 static void
   4394 exec_resq_img(struct tgsi_exec_machine *mach,
   4395               const struct tgsi_full_instruction *inst)
   4396 {
   4397    int result[4];
   4398    union tgsi_exec_channel r[4];
   4399    uint unit;
   4400    int i, chan, j;
   4401    struct tgsi_image_params params;
   4402    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
   4403 
   4404    unit = fetch_sampler_unit(mach, inst, 0);
   4405 
   4406    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
   4407    params.unit = unit;
   4408    params.tgsi_tex_instr = inst->Memory.Texture;
   4409    params.format = inst->Memory.Format;
   4410 
   4411    mach->Image->get_dims(mach->Image, &params, result);
   4412 
   4413    for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   4414       for (j = 0; j < 4; j++) {
   4415          r[j].i[i] = result[j];
   4416       }
   4417    }
   4418 
   4419    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   4420       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   4421          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan,
   4422                     TGSI_EXEC_DATA_INT);
   4423       }
   4424    }
   4425 }
   4426 
   4427 static void
   4428 exec_resq_buf(struct tgsi_exec_machine *mach,
   4429               const struct tgsi_full_instruction *inst)
   4430 {
   4431    int result;
   4432    union tgsi_exec_channel r[4];
   4433    uint unit;
   4434    int i, chan;
   4435    struct tgsi_buffer_params params;
   4436    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
   4437 
   4438    unit = fetch_sampler_unit(mach, inst, 0);
   4439 
   4440    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
   4441    params.unit = unit;
   4442 
   4443    mach->Buffer->get_dims(mach->Buffer, &params, &result);
   4444 
   4445    for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   4446       r[0].i[i] = result;
   4447    }
   4448 
   4449    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   4450       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   4451          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan,
   4452                     TGSI_EXEC_DATA_INT);
   4453       }
   4454    }
   4455 }
   4456 
   4457 static void
   4458 exec_resq(struct tgsi_exec_machine *mach,
   4459           const struct tgsi_full_instruction *inst)
   4460 {
   4461    if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
   4462       exec_resq_img(mach, inst);
   4463    else
   4464       exec_resq_buf(mach, inst);
   4465 }
   4466 
   4467 static void
   4468 micro_f2u64(union tgsi_double_channel *dst,
   4469             const union tgsi_exec_channel *src)
   4470 {
   4471    dst->u64[0] = (uint64_t)src->f[0];
   4472    dst->u64[1] = (uint64_t)src->f[1];
   4473    dst->u64[2] = (uint64_t)src->f[2];
   4474    dst->u64[3] = (uint64_t)src->f[3];
   4475 }
   4476 
   4477 static void
   4478 micro_f2i64(union tgsi_double_channel *dst,
   4479             const union tgsi_exec_channel *src)
   4480 {
   4481    dst->i64[0] = (int64_t)src->f[0];
   4482    dst->i64[1] = (int64_t)src->f[1];
   4483    dst->i64[2] = (int64_t)src->f[2];
   4484    dst->i64[3] = (int64_t)src->f[3];
   4485 }
   4486 
   4487 static void
   4488 micro_u2i64(union tgsi_double_channel *dst,
   4489             const union tgsi_exec_channel *src)
   4490 {
   4491    dst->u64[0] = (uint64_t)src->u[0];
   4492    dst->u64[1] = (uint64_t)src->u[1];
   4493    dst->u64[2] = (uint64_t)src->u[2];
   4494    dst->u64[3] = (uint64_t)src->u[3];
   4495 }
   4496 
   4497 static void
   4498 micro_i2i64(union tgsi_double_channel *dst,
   4499             const union tgsi_exec_channel *src)
   4500 {
   4501    dst->i64[0] = (int64_t)src->i[0];
   4502    dst->i64[1] = (int64_t)src->i[1];
   4503    dst->i64[2] = (int64_t)src->i[2];
   4504    dst->i64[3] = (int64_t)src->i[3];
   4505 }
   4506 
   4507 static void
   4508 micro_d2u64(union tgsi_double_channel *dst,
   4509            const union tgsi_double_channel *src)
   4510 {
   4511    dst->u64[0] = (uint64_t)src->d[0];
   4512    dst->u64[1] = (uint64_t)src->d[1];
   4513    dst->u64[2] = (uint64_t)src->d[2];
   4514    dst->u64[3] = (uint64_t)src->d[3];
   4515 }
   4516 
   4517 static void
   4518 micro_d2i64(union tgsi_double_channel *dst,
   4519            const union tgsi_double_channel *src)
   4520 {
   4521    dst->i64[0] = (int64_t)src->d[0];
   4522    dst->i64[1] = (int64_t)src->d[1];
   4523    dst->i64[2] = (int64_t)src->d[2];
   4524    dst->i64[3] = (int64_t)src->d[3];
   4525 }
   4526 
   4527 static void
   4528 micro_u642d(union tgsi_double_channel *dst,
   4529            const union tgsi_double_channel *src)
   4530 {
   4531    dst->d[0] = (double)src->u64[0];
   4532    dst->d[1] = (double)src->u64[1];
   4533    dst->d[2] = (double)src->u64[2];
   4534    dst->d[3] = (double)src->u64[3];
   4535 }
   4536 
   4537 static void
   4538 micro_i642d(union tgsi_double_channel *dst,
   4539            const union tgsi_double_channel *src)
   4540 {
   4541    dst->d[0] = (double)src->i64[0];
   4542    dst->d[1] = (double)src->i64[1];
   4543    dst->d[2] = (double)src->i64[2];
   4544    dst->d[3] = (double)src->i64[3];
   4545 }
   4546 
   4547 static void
   4548 micro_u642f(union tgsi_exec_channel *dst,
   4549             const union tgsi_double_channel *src)
   4550 {
   4551    dst->f[0] = (float)src->u64[0];
   4552    dst->f[1] = (float)src->u64[1];
   4553    dst->f[2] = (float)src->u64[2];
   4554    dst->f[3] = (float)src->u64[3];
   4555 }
   4556 
   4557 static void
   4558 micro_i642f(union tgsi_exec_channel *dst,
   4559             const union tgsi_double_channel *src)
   4560 {
   4561    dst->f[0] = (float)src->i64[0];
   4562    dst->f[1] = (float)src->i64[1];
   4563    dst->f[2] = (float)src->i64[2];
   4564    dst->f[3] = (float)src->i64[3];
   4565 }
   4566 
   4567 static void
   4568 exec_t_2_64(struct tgsi_exec_machine *mach,
   4569           const struct tgsi_full_instruction *inst,
   4570           micro_dop_s op,
   4571           enum tgsi_exec_datatype src_datatype)
   4572 {
   4573    union tgsi_exec_channel src;
   4574    union tgsi_double_channel dst;
   4575 
   4576    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) == TGSI_WRITEMASK_XY) {
   4577       fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, src_datatype);
   4578       op(&dst, &src);
   4579       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
   4580    }
   4581    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_ZW) == TGSI_WRITEMASK_ZW) {
   4582       fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_Y, src_datatype);
   4583       op(&dst, &src);
   4584       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
   4585    }
   4586 }
   4587 
   4588 static void
   4589 exec_64_2_t(struct tgsi_exec_machine *mach,
   4590             const struct tgsi_full_instruction *inst,
   4591             micro_sop_d op,
   4592             enum tgsi_exec_datatype dst_datatype)
   4593 {
   4594    union tgsi_double_channel src;
   4595    union tgsi_exec_channel dst;
   4596    int wm = inst->Dst[0].Register.WriteMask;
   4597    int i;
   4598    int bit;
   4599    for (i = 0; i < 2; i++) {
   4600       bit = ffs(wm);
   4601       if (bit) {
   4602          wm &= ~(1 << (bit - 1));
   4603          if (i == 0)
   4604             fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
   4605          else
   4606             fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
   4607          op(&dst, &src);
   4608          store_dest(mach, &dst, &inst->Dst[0], inst, bit - 1, dst_datatype);
   4609       }
   4610    }
   4611 }
   4612 
   4613 static void
   4614 micro_i2f(union tgsi_exec_channel *dst,
   4615           const union tgsi_exec_channel *src)
   4616 {
   4617    dst->f[0] = (float)src->i[0];
   4618    dst->f[1] = (float)src->i[1];
   4619    dst->f[2] = (float)src->i[2];
   4620    dst->f[3] = (float)src->i[3];
   4621 }
   4622 
   4623 static void
   4624 micro_not(union tgsi_exec_channel *dst,
   4625           const union tgsi_exec_channel *src)
   4626 {
   4627    dst->u[0] = ~src->u[0];
   4628    dst->u[1] = ~src->u[1];
   4629    dst->u[2] = ~src->u[2];
   4630    dst->u[3] = ~src->u[3];
   4631 }
   4632 
   4633 static void
   4634 micro_shl(union tgsi_exec_channel *dst,
   4635           const union tgsi_exec_channel *src0,
   4636           const union tgsi_exec_channel *src1)
   4637 {
   4638    unsigned masked_count;
   4639    masked_count = src1->u[0] & 0x1f;
   4640    dst->u[0] = src0->u[0] << masked_count;
   4641    masked_count = src1->u[1] & 0x1f;
   4642    dst->u[1] = src0->u[1] << masked_count;
   4643    masked_count = src1->u[2] & 0x1f;
   4644    dst->u[2] = src0->u[2] << masked_count;
   4645    masked_count = src1->u[3] & 0x1f;
   4646    dst->u[3] = src0->u[3] << masked_count;
   4647 }
   4648 
   4649 static void
   4650 micro_and(union tgsi_exec_channel *dst,
   4651           const union tgsi_exec_channel *src0,
   4652           const union tgsi_exec_channel *src1)
   4653 {
   4654    dst->u[0] = src0->u[0] & src1->u[0];
   4655    dst->u[1] = src0->u[1] & src1->u[1];
   4656    dst->u[2] = src0->u[2] & src1->u[2];
   4657    dst->u[3] = src0->u[3] & src1->u[3];
   4658 }
   4659 
   4660 static void
   4661 micro_or(union tgsi_exec_channel *dst,
   4662          const union tgsi_exec_channel *src0,
   4663          const union tgsi_exec_channel *src1)
   4664 {
   4665    dst->u[0] = src0->u[0] | src1->u[0];
   4666    dst->u[1] = src0->u[1] | src1->u[1];
   4667    dst->u[2] = src0->u[2] | src1->u[2];
   4668    dst->u[3] = src0->u[3] | src1->u[3];
   4669 }
   4670 
   4671 static void
   4672 micro_xor(union tgsi_exec_channel *dst,
   4673           const union tgsi_exec_channel *src0,
   4674           const union tgsi_exec_channel *src1)
   4675 {
   4676    dst->u[0] = src0->u[0] ^ src1->u[0];
   4677    dst->u[1] = src0->u[1] ^ src1->u[1];
   4678    dst->u[2] = src0->u[2] ^ src1->u[2];
   4679    dst->u[3] = src0->u[3] ^ src1->u[3];
   4680 }
   4681 
   4682 static void
   4683 micro_mod(union tgsi_exec_channel *dst,
   4684           const union tgsi_exec_channel *src0,
   4685           const union tgsi_exec_channel *src1)
   4686 {
   4687    dst->i[0] = src0->i[0] % src1->i[0];
   4688    dst->i[1] = src0->i[1] % src1->i[1];
   4689    dst->i[2] = src0->i[2] % src1->i[2];
   4690    dst->i[3] = src0->i[3] % src1->i[3];
   4691 }
   4692 
   4693 static void
   4694 micro_f2i(union tgsi_exec_channel *dst,
   4695           const union tgsi_exec_channel *src)
   4696 {
   4697    dst->i[0] = (int)src->f[0];
   4698    dst->i[1] = (int)src->f[1];
   4699    dst->i[2] = (int)src->f[2];
   4700    dst->i[3] = (int)src->f[3];
   4701 }
   4702 
   4703 static void
   4704 micro_fseq(union tgsi_exec_channel *dst,
   4705            const union tgsi_exec_channel *src0,
   4706            const union tgsi_exec_channel *src1)
   4707 {
   4708    dst->u[0] = src0->f[0] == src1->f[0] ? ~0 : 0;
   4709    dst->u[1] = src0->f[1] == src1->f[1] ? ~0 : 0;
   4710    dst->u[2] = src0->f[2] == src1->f[2] ? ~0 : 0;
   4711    dst->u[3] = src0->f[3] == src1->f[3] ? ~0 : 0;
   4712 }
   4713 
   4714 static void
   4715 micro_fsge(union tgsi_exec_channel *dst,
   4716            const union tgsi_exec_channel *src0,
   4717            const union tgsi_exec_channel *src1)
   4718 {
   4719    dst->u[0] = src0->f[0] >= src1->f[0] ? ~0 : 0;
   4720    dst->u[1] = src0->f[1] >= src1->f[1] ? ~0 : 0;
   4721    dst->u[2] = src0->f[2] >= src1->f[2] ? ~0 : 0;
   4722    dst->u[3] = src0->f[3] >= src1->f[3] ? ~0 : 0;
   4723 }
   4724 
   4725 static void
   4726 micro_fslt(union tgsi_exec_channel *dst,
   4727            const union tgsi_exec_channel *src0,
   4728            const union tgsi_exec_channel *src1)
   4729 {
   4730    dst->u[0] = src0->f[0] < src1->f[0] ? ~0 : 0;
   4731    dst->u[1] = src0->f[1] < src1->f[1] ? ~0 : 0;
   4732    dst->u[2] = src0->f[2] < src1->f[2] ? ~0 : 0;
   4733    dst->u[3] = src0->f[3] < src1->f[3] ? ~0 : 0;
   4734 }
   4735 
   4736 static void
   4737 micro_fsne(union tgsi_exec_channel *dst,
   4738            const union tgsi_exec_channel *src0,
   4739            const union tgsi_exec_channel *src1)
   4740 {
   4741    dst->u[0] = src0->f[0] != src1->f[0] ? ~0 : 0;
   4742    dst->u[1] = src0->f[1] != src1->f[1] ? ~0 : 0;
   4743    dst->u[2] = src0->f[2] != src1->f[2] ? ~0 : 0;
   4744    dst->u[3] = src0->f[3] != src1->f[3] ? ~0 : 0;
   4745 }
   4746 
   4747 static void
   4748 micro_idiv(union tgsi_exec_channel *dst,
   4749            const union tgsi_exec_channel *src0,
   4750            const union tgsi_exec_channel *src1)
   4751 {
   4752    dst->i[0] = src1->i[0] ? src0->i[0] / src1->i[0] : 0;
   4753    dst->i[1] = src1->i[1] ? src0->i[1] / src1->i[1] : 0;
   4754    dst->i[2] = src1->i[2] ? src0->i[2] / src1->i[2] : 0;
   4755    dst->i[3] = src1->i[3] ? src0->i[3] / src1->i[3] : 0;
   4756 }
   4757 
   4758 static void
   4759 micro_imax(union tgsi_exec_channel *dst,
   4760            const union tgsi_exec_channel *src0,
   4761            const union tgsi_exec_channel *src1)
   4762 {
   4763    dst->i[0] = src0->i[0] > src1->i[0] ? src0->i[0] : src1->i[0];
   4764    dst->i[1] = src0->i[1] > src1->i[1] ? src0->i[1] : src1->i[1];
   4765    dst->i[2] = src0->i[2] > src1->i[2] ? src0->i[2] : src1->i[2];
   4766    dst->i[3] = src0->i[3] > src1->i[3] ? src0->i[3] : src1->i[3];
   4767 }
   4768 
   4769 static void
   4770 micro_imin(union tgsi_exec_channel *dst,
   4771            const union tgsi_exec_channel *src0,
   4772            const union tgsi_exec_channel *src1)
   4773 {
   4774    dst->i[0] = src0->i[0] < src1->i[0] ? src0->i[0] : src1->i[0];
   4775    dst->i[1] = src0->i[1] < src1->i[1] ? src0->i[1] : src1->i[1];
   4776    dst->i[2] = src0->i[2] < src1->i[2] ? src0->i[2] : src1->i[2];
   4777    dst->i[3] = src0->i[3] < src1->i[3] ? src0->i[3] : src1->i[3];
   4778 }
   4779 
   4780 static void
   4781 micro_isge(union tgsi_exec_channel *dst,
   4782            const union tgsi_exec_channel *src0,
   4783            const union tgsi_exec_channel *src1)
   4784 {
   4785    dst->i[0] = src0->i[0] >= src1->i[0] ? -1 : 0;
   4786    dst->i[1] = src0->i[1] >= src1->i[1] ? -1 : 0;
   4787    dst->i[2] = src0->i[2] >= src1->i[2] ? -1 : 0;
   4788    dst->i[3] = src0->i[3] >= src1->i[3] ? -1 : 0;
   4789 }
   4790 
   4791 static void
   4792 micro_ishr(union tgsi_exec_channel *dst,
   4793            const union tgsi_exec_channel *src0,
   4794            const union tgsi_exec_channel *src1)
   4795 {
   4796    unsigned masked_count;
   4797    masked_count = src1->i[0] & 0x1f;
   4798    dst->i[0] = src0->i[0] >> masked_count;
   4799    masked_count = src1->i[1] & 0x1f;
   4800    dst->i[1] = src0->i[1] >> masked_count;
   4801    masked_count = src1->i[2] & 0x1f;
   4802    dst->i[2] = src0->i[2] >> masked_count;
   4803    masked_count = src1->i[3] & 0x1f;
   4804    dst->i[3] = src0->i[3] >> masked_count;
   4805 }
   4806 
   4807 static void
   4808 micro_islt(union tgsi_exec_channel *dst,
   4809            const union tgsi_exec_channel *src0,
   4810            const union tgsi_exec_channel *src1)
   4811 {
   4812    dst->i[0] = src0->i[0] < src1->i[0] ? -1 : 0;
   4813    dst->i[1] = src0->i[1] < src1->i[1] ? -1 : 0;
   4814    dst->i[2] = src0->i[2] < src1->i[2] ? -1 : 0;
   4815    dst->i[3] = src0->i[3] < src1->i[3] ? -1 : 0;
   4816 }
   4817 
   4818 static void
   4819 micro_f2u(union tgsi_exec_channel *dst,
   4820           const union tgsi_exec_channel *src)
   4821 {
   4822    dst->u[0] = (uint)src->f[0];
   4823    dst->u[1] = (uint)src->f[1];
   4824    dst->u[2] = (uint)src->f[2];
   4825    dst->u[3] = (uint)src->f[3];
   4826 }
   4827 
   4828 static void
   4829 micro_u2f(union tgsi_exec_channel *dst,
   4830           const union tgsi_exec_channel *src)
   4831 {
   4832    dst->f[0] = (float)src->u[0];
   4833    dst->f[1] = (float)src->u[1];
   4834    dst->f[2] = (float)src->u[2];
   4835    dst->f[3] = (float)src->u[3];
   4836 }
   4837 
   4838 static void
   4839 micro_uadd(union tgsi_exec_channel *dst,
   4840            const union tgsi_exec_channel *src0,
   4841            const union tgsi_exec_channel *src1)
   4842 {
   4843    dst->u[0] = src0->u[0] + src1->u[0];
   4844    dst->u[1] = src0->u[1] + src1->u[1];
   4845    dst->u[2] = src0->u[2] + src1->u[2];
   4846    dst->u[3] = src0->u[3] + src1->u[3];
   4847 }
   4848 
   4849 static void
   4850 micro_udiv(union tgsi_exec_channel *dst,
   4851            const union tgsi_exec_channel *src0,
   4852            const union tgsi_exec_channel *src1)
   4853 {
   4854    dst->u[0] = src1->u[0] ? src0->u[0] / src1->u[0] : ~0u;
   4855    dst->u[1] = src1->u[1] ? src0->u[1] / src1->u[1] : ~0u;
   4856    dst->u[2] = src1->u[2] ? src0->u[2] / src1->u[2] : ~0u;
   4857    dst->u[3] = src1->u[3] ? src0->u[3] / src1->u[3] : ~0u;
   4858 }
   4859 
   4860 static void
   4861 micro_umad(union tgsi_exec_channel *dst,
   4862            const union tgsi_exec_channel *src0,
   4863            const union tgsi_exec_channel *src1,
   4864            const union tgsi_exec_channel *src2)
   4865 {
   4866    dst->u[0] = src0->u[0] * src1->u[0] + src2->u[0];
   4867    dst->u[1] = src0->u[1] * src1->u[1] + src2->u[1];
   4868    dst->u[2] = src0->u[2] * src1->u[2] + src2->u[2];
   4869    dst->u[3] = src0->u[3] * src1->u[3] + src2->u[3];
   4870 }
   4871 
   4872 static void
   4873 micro_umax(union tgsi_exec_channel *dst,
   4874            const union tgsi_exec_channel *src0,
   4875            const union tgsi_exec_channel *src1)
   4876 {
   4877    dst->u[0] = src0->u[0] > src1->u[0] ? src0->u[0] : src1->u[0];
   4878    dst->u[1] = src0->u[1] > src1->u[1] ? src0->u[1] : src1->u[1];
   4879    dst->u[2] = src0->u[2] > src1->u[2] ? src0->u[2] : src1->u[2];
   4880    dst->u[3] = src0->u[3] > src1->u[3] ? src0->u[3] : src1->u[3];
   4881 }
   4882 
   4883 static void
   4884 micro_umin(union tgsi_exec_channel *dst,
   4885            const union tgsi_exec_channel *src0,
   4886            const union tgsi_exec_channel *src1)
   4887 {
   4888    dst->u[0] = src0->u[0] < src1->u[0] ? src0->u[0] : src1->u[0];
   4889    dst->u[1] = src0->u[1] < src1->u[1] ? src0->u[1] : src1->u[1];
   4890    dst->u[2] = src0->u[2] < src1->u[2] ? src0->u[2] : src1->u[2];
   4891    dst->u[3] = src0->u[3] < src1->u[3] ? src0->u[3] : src1->u[3];
   4892 }
   4893 
   4894 static void
   4895 micro_umod(union tgsi_exec_channel *dst,
   4896            const union tgsi_exec_channel *src0,
   4897            const union tgsi_exec_channel *src1)
   4898 {
   4899    dst->u[0] = src1->u[0] ? src0->u[0] % src1->u[0] : ~0u;
   4900    dst->u[1] = src1->u[1] ? src0->u[1] % src1->u[1] : ~0u;
   4901    dst->u[2] = src1->u[2] ? src0->u[2] % src1->u[2] : ~0u;
   4902    dst->u[3] = src1->u[3] ? src0->u[3] % src1->u[3] : ~0u;
   4903 }
   4904 
   4905 static void
   4906 micro_umul(union tgsi_exec_channel *dst,
   4907            const union tgsi_exec_channel *src0,
   4908            const union tgsi_exec_channel *src1)
   4909 {
   4910    dst->u[0] = src0->u[0] * src1->u[0];
   4911    dst->u[1] = src0->u[1] * src1->u[1];
   4912    dst->u[2] = src0->u[2] * src1->u[2];
   4913    dst->u[3] = src0->u[3] * src1->u[3];
   4914 }
   4915 
   4916 static void
   4917 micro_imul_hi(union tgsi_exec_channel *dst,
   4918               const union tgsi_exec_channel *src0,
   4919               const union tgsi_exec_channel *src1)
   4920 {
   4921 #define I64M(x, y) ((((int64_t)x) * ((int64_t)y)) >> 32)
   4922    dst->i[0] = I64M(src0->i[0], src1->i[0]);
   4923    dst->i[1] = I64M(src0->i[1], src1->i[1]);
   4924    dst->i[2] = I64M(src0->i[2], src1->i[2]);
   4925    dst->i[3] = I64M(src0->i[3], src1->i[3]);
   4926 #undef I64M
   4927 }
   4928 
   4929 static void
   4930 micro_umul_hi(union tgsi_exec_channel *dst,
   4931               const union tgsi_exec_channel *src0,
   4932               const union tgsi_exec_channel *src1)
   4933 {
   4934 #define U64M(x, y) ((((uint64_t)x) * ((uint64_t)y)) >> 32)
   4935    dst->u[0] = U64M(src0->u[0], src1->u[0]);
   4936    dst->u[1] = U64M(src0->u[1], src1->u[1]);
   4937    dst->u[2] = U64M(src0->u[2], src1->u[2]);
   4938    dst->u[3] = U64M(src0->u[3], src1->u[3]);
   4939 #undef U64M
   4940 }
   4941 
   4942 static void
   4943 micro_useq(union tgsi_exec_channel *dst,
   4944            const union tgsi_exec_channel *src0,
   4945            const union tgsi_exec_channel *src1)
   4946 {
   4947    dst->u[0] = src0->u[0] == src1->u[0] ? ~0 : 0;
   4948    dst->u[1] = src0->u[1] == src1->u[1] ? ~0 : 0;
   4949    dst->u[2] = src0->u[2] == src1->u[2] ? ~0 : 0;
   4950    dst->u[3] = src0->u[3] == src1->u[3] ? ~0 : 0;
   4951 }
   4952 
   4953 static void
   4954 micro_usge(union tgsi_exec_channel *dst,
   4955            const union tgsi_exec_channel *src0,
   4956            const union tgsi_exec_channel *src1)
   4957 {
   4958    dst->u[0] = src0->u[0] >= src1->u[0] ? ~0 : 0;
   4959    dst->u[1] = src0->u[1] >= src1->u[1] ? ~0 : 0;
   4960    dst->u[2] = src0->u[2] >= src1->u[2] ? ~0 : 0;
   4961    dst->u[3] = src0->u[3] >= src1->u[3] ? ~0 : 0;
   4962 }
   4963 
   4964 static void
   4965 micro_ushr(union tgsi_exec_channel *dst,
   4966            const union tgsi_exec_channel *src0,
   4967            const union tgsi_exec_channel *src1)
   4968 {
   4969    unsigned masked_count;
   4970    masked_count = src1->u[0] & 0x1f;
   4971    dst->u[0] = src0->u[0] >> masked_count;
   4972    masked_count = src1->u[1] & 0x1f;
   4973    dst->u[1] = src0->u[1] >> masked_count;
   4974    masked_count = src1->u[2] & 0x1f;
   4975    dst->u[2] = src0->u[2] >> masked_count;
   4976    masked_count = src1->u[3] & 0x1f;
   4977    dst->u[3] = src0->u[3] >> masked_count;
   4978 }
   4979 
   4980 static void
   4981 micro_uslt(union tgsi_exec_channel *dst,
   4982            const union tgsi_exec_channel *src0,
   4983            const union tgsi_exec_channel *src1)
   4984 {
   4985    dst->u[0] = src0->u[0] < src1->u[0] ? ~0 : 0;
   4986    dst->u[1] = src0->u[1] < src1->u[1] ? ~0 : 0;
   4987    dst->u[2] = src0->u[2] < src1->u[2] ? ~0 : 0;
   4988    dst->u[3] = src0->u[3] < src1->u[3] ? ~0 : 0;
   4989 }
   4990 
   4991 static void
   4992 micro_usne(union tgsi_exec_channel *dst,
   4993            const union tgsi_exec_channel *src0,
   4994            const union tgsi_exec_channel *src1)
   4995 {
   4996    dst->u[0] = src0->u[0] != src1->u[0] ? ~0 : 0;
   4997    dst->u[1] = src0->u[1] != src1->u[1] ? ~0 : 0;
   4998    dst->u[2] = src0->u[2] != src1->u[2] ? ~0 : 0;
   4999    dst->u[3] = src0->u[3] != src1->u[3] ? ~0 : 0;
   5000 }
   5001 
   5002 static void
   5003 micro_uarl(union tgsi_exec_channel *dst,
   5004            const union tgsi_exec_channel *src)
   5005 {
   5006    dst->i[0] = src->u[0];
   5007    dst->i[1] = src->u[1];
   5008    dst->i[2] = src->u[2];
   5009    dst->i[3] = src->u[3];
   5010 }
   5011 
   5012 static void
   5013 micro_ucmp(union tgsi_exec_channel *dst,
   5014            const union tgsi_exec_channel *src0,
   5015            const union tgsi_exec_channel *src1,
   5016            const union tgsi_exec_channel *src2)
   5017 {
   5018    dst->u[0] = src0->u[0] ? src1->u[0] : src2->u[0];
   5019    dst->u[1] = src0->u[1] ? src1->u[1] : src2->u[1];
   5020    dst->u[2] = src0->u[2] ? src1->u[2] : src2->u[2];
   5021    dst->u[3] = src0->u[3] ? src1->u[3] : src2->u[3];
   5022 }
   5023 
   5024 /**
   5025  * Signed bitfield extract (i.e. sign-extend the extracted bits)
   5026  */
   5027 static void
   5028 micro_ibfe(union tgsi_exec_channel *dst,
   5029            const union tgsi_exec_channel *src0,
   5030            const union tgsi_exec_channel *src1,
   5031            const union tgsi_exec_channel *src2)
   5032 {
   5033    int i;
   5034    for (i = 0; i < 4; i++) {
   5035       int width = src2->i[i] & 0x1f;
   5036       int offset = src1->i[i] & 0x1f;
   5037       if (width == 0)
   5038          dst->i[i] = 0;
   5039       else if (width + offset < 32)
   5040          dst->i[i] = (src0->i[i] << (32 - width - offset)) >> (32 - width);
   5041       else
   5042          dst->i[i] = src0->i[i] >> offset;
   5043    }
   5044 }
   5045 
   5046 /**
   5047  * Unsigned bitfield extract
   5048  */
   5049 static void
   5050 micro_ubfe(union tgsi_exec_channel *dst,
   5051            const union tgsi_exec_channel *src0,
   5052            const union tgsi_exec_channel *src1,
   5053            const union tgsi_exec_channel *src2)
   5054 {
   5055    int i;
   5056    for (i = 0; i < 4; i++) {
   5057       int width = src2->u[i] & 0x1f;
   5058       int offset = src1->u[i] & 0x1f;
   5059       if (width == 0)
   5060          dst->u[i] = 0;
   5061       else if (width + offset < 32)
   5062          dst->u[i] = (src0->u[i] << (32 - width - offset)) >> (32 - width);
   5063       else
   5064          dst->u[i] = src0->u[i] >> offset;
   5065    }
   5066 }
   5067 
   5068 /**
   5069  * Bitfield insert: copy low bits from src1 into a region of src0.
   5070  */
   5071 static void
   5072 micro_bfi(union tgsi_exec_channel *dst,
   5073           const union tgsi_exec_channel *src0,
   5074           const union tgsi_exec_channel *src1,
   5075           const union tgsi_exec_channel *src2,
   5076           const union tgsi_exec_channel *src3)
   5077 {
   5078    int i;
   5079    for (i = 0; i < 4; i++) {
   5080       int width = src3->u[i] & 0x1f;
   5081       int offset = src2->u[i] & 0x1f;
   5082       int bitmask = ((1 << width) - 1) << offset;
   5083       dst->u[i] = ((src1->u[i] << offset) & bitmask) | (src0->u[i] & ~bitmask);
   5084    }
   5085 }
   5086 
   5087 static void
   5088 micro_brev(union tgsi_exec_channel *dst,
   5089            const union tgsi_exec_channel *src)
   5090 {
   5091    dst->u[0] = util_bitreverse(src->u[0]);
   5092    dst->u[1] = util_bitreverse(src->u[1]);
   5093    dst->u[2] = util_bitreverse(src->u[2]);
   5094    dst->u[3] = util_bitreverse(src->u[3]);
   5095 }
   5096 
   5097 static void
   5098 micro_popc(union tgsi_exec_channel *dst,
   5099            const union tgsi_exec_channel *src)
   5100 {
   5101    dst->u[0] = util_bitcount(src->u[0]);
   5102    dst->u[1] = util_bitcount(src->u[1]);
   5103    dst->u[2] = util_bitcount(src->u[2]);
   5104    dst->u[3] = util_bitcount(src->u[3]);
   5105 }
   5106 
   5107 static void
   5108 micro_lsb(union tgsi_exec_channel *dst,
   5109           const union tgsi_exec_channel *src)
   5110 {
   5111    dst->i[0] = ffs(src->u[0]) - 1;
   5112    dst->i[1] = ffs(src->u[1]) - 1;
   5113    dst->i[2] = ffs(src->u[2]) - 1;
   5114    dst->i[3] = ffs(src->u[3]) - 1;
   5115 }
   5116 
   5117 static void
   5118 micro_imsb(union tgsi_exec_channel *dst,
   5119            const union tgsi_exec_channel *src)
   5120 {
   5121    dst->i[0] = util_last_bit_signed(src->i[0]) - 1;
   5122    dst->i[1] = util_last_bit_signed(src->i[1]) - 1;
   5123    dst->i[2] = util_last_bit_signed(src->i[2]) - 1;
   5124    dst->i[3] = util_last_bit_signed(src->i[3]) - 1;
   5125 }
   5126 
   5127 static void
   5128 micro_umsb(union tgsi_exec_channel *dst,
   5129            const union tgsi_exec_channel *src)
   5130 {
   5131    dst->i[0] = util_last_bit(src->u[0]) - 1;
   5132    dst->i[1] = util_last_bit(src->u[1]) - 1;
   5133    dst->i[2] = util_last_bit(src->u[2]) - 1;
   5134    dst->i[3] = util_last_bit(src->u[3]) - 1;
   5135 }
   5136 
   5137 /**
   5138  * Execute a TGSI instruction.
   5139  * Returns TRUE if a barrier instruction is hit,
   5140  * otherwise FALSE.
   5141  */
   5142 static boolean
   5143 exec_instruction(
   5144    struct tgsi_exec_machine *mach,
   5145    const struct tgsi_full_instruction *inst,
   5146    int *pc )
   5147 {
   5148    union tgsi_exec_channel r[10];
   5149 
   5150    (*pc)++;
   5151 
   5152    switch (inst->Instruction.Opcode) {
   5153    case TGSI_OPCODE_ARL:
   5154       exec_vector_unary(mach, inst, micro_arl, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
   5155       break;
   5156 
   5157    case TGSI_OPCODE_MOV:
   5158       exec_vector_unary(mach, inst, micro_mov, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
   5159       break;
   5160 
   5161    case TGSI_OPCODE_LIT:
   5162       exec_lit(mach, inst);
   5163       break;
   5164 
   5165    case TGSI_OPCODE_RCP:
   5166       exec_scalar_unary(mach, inst, micro_rcp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5167       break;
   5168 
   5169    case TGSI_OPCODE_RSQ:
   5170       exec_scalar_unary(mach, inst, micro_rsq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5171       break;
   5172 
   5173    case TGSI_OPCODE_EXP:
   5174       exec_exp(mach, inst);
   5175       break;
   5176 
   5177    case TGSI_OPCODE_LOG:
   5178       exec_log(mach, inst);
   5179       break;
   5180 
   5181    case TGSI_OPCODE_MUL:
   5182       exec_vector_binary(mach, inst, micro_mul, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5183       break;
   5184 
   5185    case TGSI_OPCODE_ADD:
   5186       exec_vector_binary(mach, inst, micro_add, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5187       break;
   5188 
   5189    case TGSI_OPCODE_DP3:
   5190       exec_dp3(mach, inst);
   5191       break;
   5192 
   5193    case TGSI_OPCODE_DP4:
   5194       exec_dp4(mach, inst);
   5195       break;
   5196 
   5197    case TGSI_OPCODE_DST:
   5198       exec_dst(mach, inst);
   5199       break;
   5200 
   5201    case TGSI_OPCODE_MIN:
   5202       exec_vector_binary(mach, inst, micro_min, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5203       break;
   5204 
   5205    case TGSI_OPCODE_MAX:
   5206       exec_vector_binary(mach, inst, micro_max, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5207       break;
   5208 
   5209    case TGSI_OPCODE_SLT:
   5210       exec_vector_binary(mach, inst, micro_slt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5211       break;
   5212 
   5213    case TGSI_OPCODE_SGE:
   5214       exec_vector_binary(mach, inst, micro_sge, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5215       break;
   5216 
   5217    case TGSI_OPCODE_MAD:
   5218       exec_vector_trinary(mach, inst, micro_mad, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5219       break;
   5220 
   5221    case TGSI_OPCODE_LRP:
   5222       exec_vector_trinary(mach, inst, micro_lrp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5223       break;
   5224 
   5225    case TGSI_OPCODE_SQRT:
   5226       exec_scalar_unary(mach, inst, micro_sqrt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5227       break;
   5228 
   5229    case TGSI_OPCODE_DP2A:
   5230       exec_dp2a(mach, inst);
   5231       break;
   5232 
   5233    case TGSI_OPCODE_FRC:
   5234       exec_vector_unary(mach, inst, micro_frc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5235       break;
   5236 
   5237    case TGSI_OPCODE_CLAMP:
   5238       exec_vector_trinary(mach, inst, micro_clamp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5239       break;
   5240 
   5241    case TGSI_OPCODE_FLR:
   5242       exec_vector_unary(mach, inst, micro_flr, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5243       break;
   5244 
   5245    case TGSI_OPCODE_ROUND:
   5246       exec_vector_unary(mach, inst, micro_rnd, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5247       break;
   5248 
   5249    case TGSI_OPCODE_EX2:
   5250       exec_scalar_unary(mach, inst, micro_exp2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5251       break;
   5252 
   5253    case TGSI_OPCODE_LG2:
   5254       exec_scalar_unary(mach, inst, micro_lg2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5255       break;
   5256 
   5257    case TGSI_OPCODE_POW:
   5258       exec_scalar_binary(mach, inst, micro_pow, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5259       break;
   5260 
   5261    case TGSI_OPCODE_XPD:
   5262       exec_xpd(mach, inst);
   5263       break;
   5264 
   5265    case TGSI_OPCODE_DPH:
   5266       exec_dph(mach, inst);
   5267       break;
   5268 
   5269    case TGSI_OPCODE_COS:
   5270       exec_scalar_unary(mach, inst, micro_cos, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5271       break;
   5272 
   5273    case TGSI_OPCODE_DDX:
   5274       exec_vector_unary(mach, inst, micro_ddx, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5275       break;
   5276 
   5277    case TGSI_OPCODE_DDY:
   5278       exec_vector_unary(mach, inst, micro_ddy, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5279       break;
   5280 
   5281    case TGSI_OPCODE_KILL:
   5282       exec_kill (mach, inst);
   5283       break;
   5284 
   5285    case TGSI_OPCODE_KILL_IF:
   5286       exec_kill_if (mach, inst);
   5287       break;
   5288 
   5289    case TGSI_OPCODE_PK2H:
   5290       exec_pk2h(mach, inst);
   5291       break;
   5292 
   5293    case TGSI_OPCODE_PK2US:
   5294       assert (0);
   5295       break;
   5296 
   5297    case TGSI_OPCODE_PK4B:
   5298       assert (0);
   5299       break;
   5300 
   5301    case TGSI_OPCODE_PK4UB:
   5302       assert (0);
   5303       break;
   5304 
   5305    case TGSI_OPCODE_SEQ:
   5306       exec_vector_binary(mach, inst, micro_seq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5307       break;
   5308 
   5309    case TGSI_OPCODE_SGT:
   5310       exec_vector_binary(mach, inst, micro_sgt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5311       break;
   5312 
   5313    case TGSI_OPCODE_SIN:
   5314       exec_scalar_unary(mach, inst, micro_sin, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5315       break;
   5316 
   5317    case TGSI_OPCODE_SLE:
   5318       exec_vector_binary(mach, inst, micro_sle, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5319       break;
   5320 
   5321    case TGSI_OPCODE_SNE:
   5322       exec_vector_binary(mach, inst, micro_sne, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5323       break;
   5324 
   5325    case TGSI_OPCODE_TEX:
   5326       /* simple texture lookup */
   5327       /* src[0] = texcoord */
   5328       /* src[1] = sampler unit */
   5329       exec_tex(mach, inst, TEX_MODIFIER_NONE, 1);
   5330       break;
   5331 
   5332    case TGSI_OPCODE_TXB:
   5333       /* Texture lookup with lod bias */
   5334       /* src[0] = texcoord (src[0].w = LOD bias) */
   5335       /* src[1] = sampler unit */
   5336       exec_tex(mach, inst, TEX_MODIFIER_LOD_BIAS, 1);
   5337       break;
   5338 
   5339    case TGSI_OPCODE_TXD:
   5340       /* Texture lookup with explict partial derivatives */
   5341       /* src[0] = texcoord */
   5342       /* src[1] = d[strq]/dx */
   5343       /* src[2] = d[strq]/dy */
   5344       /* src[3] = sampler unit */
   5345       exec_txd(mach, inst);
   5346       break;
   5347 
   5348    case TGSI_OPCODE_TXL:
   5349       /* Texture lookup with explit LOD */
   5350       /* src[0] = texcoord (src[0].w = LOD) */
   5351       /* src[1] = sampler unit */
   5352       exec_tex(mach, inst, TEX_MODIFIER_EXPLICIT_LOD, 1);
   5353       break;
   5354 
   5355    case TGSI_OPCODE_TXP:
   5356       /* Texture lookup with projection */
   5357       /* src[0] = texcoord (src[0].w = projection) */
   5358       /* src[1] = sampler unit */
   5359       exec_tex(mach, inst, TEX_MODIFIER_PROJECTED, 1);
   5360       break;
   5361 
   5362    case TGSI_OPCODE_TG4:
   5363       /* src[0] = texcoord */
   5364       /* src[1] = component */
   5365       /* src[2] = sampler unit */
   5366       exec_tex(mach, inst, TEX_MODIFIER_GATHER, 2);
   5367       break;
   5368 
   5369    case TGSI_OPCODE_LODQ:
   5370       /* src[0] = texcoord */
   5371       /* src[1] = sampler unit */
   5372       exec_lodq(mach, inst);
   5373       break;
   5374 
   5375    case TGSI_OPCODE_UP2H:
   5376       exec_up2h(mach, inst);
   5377       break;
   5378 
   5379    case TGSI_OPCODE_UP2US:
   5380       assert (0);
   5381       break;
   5382 
   5383    case TGSI_OPCODE_UP4B:
   5384       assert (0);
   5385       break;
   5386 
   5387    case TGSI_OPCODE_UP4UB:
   5388       assert (0);
   5389       break;
   5390 
   5391    case TGSI_OPCODE_ARR:
   5392       exec_vector_unary(mach, inst, micro_arr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
   5393       break;
   5394 
   5395    case TGSI_OPCODE_CAL:
   5396       /* skip the call if no execution channels are enabled */
   5397       if (mach->ExecMask) {
   5398          /* do the call */
   5399 
   5400          /* First, record the depths of the execution stacks.
   5401           * This is important for deeply nested/looped return statements.
   5402           * We have to unwind the stacks by the correct amount.  For a
   5403           * real code generator, we could determine the number of entries
   5404           * to pop off each stack with simple static analysis and avoid
   5405           * implementing this data structure at run time.
   5406           */
   5407          mach->CallStack[mach->CallStackTop].CondStackTop = mach->CondStackTop;
   5408          mach->CallStack[mach->CallStackTop].LoopStackTop = mach->LoopStackTop;
   5409          mach->CallStack[mach->CallStackTop].ContStackTop = mach->ContStackTop;
   5410          mach->CallStack[mach->CallStackTop].SwitchStackTop = mach->SwitchStackTop;
   5411          mach->CallStack[mach->CallStackTop].BreakStackTop = mach->BreakStackTop;
   5412          /* note that PC was already incremented above */
   5413          mach->CallStack[mach->CallStackTop].ReturnAddr = *pc;
   5414 
   5415          mach->CallStackTop++;
   5416 
   5417          /* Second, push the Cond, Loop, Cont, Func stacks */
   5418          assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
   5419          assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
   5420          assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
   5421          assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
   5422          assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
   5423          assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
   5424 
   5425          mach->CondStack[mach->CondStackTop++] = mach->CondMask;
   5426          mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
   5427          mach->ContStack[mach->ContStackTop++] = mach->ContMask;
   5428          mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
   5429          mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
   5430          mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
   5431 
   5432          /* Finally, jump to the subroutine.  The label is a pointer
   5433           * (an instruction number) to the BGNSUB instruction.
   5434           */
   5435          *pc = inst->Label.Label;
   5436          assert(mach->Instructions[*pc].Instruction.Opcode
   5437                 == TGSI_OPCODE_BGNSUB);
   5438       }
   5439       break;
   5440 
   5441    case TGSI_OPCODE_RET:
   5442       mach->FuncMask &= ~mach->ExecMask;
   5443       UPDATE_EXEC_MASK(mach);
   5444 
   5445       if (mach->FuncMask == 0x0) {
   5446          /* really return now (otherwise, keep executing */
   5447 
   5448          if (mach->CallStackTop == 0) {
   5449             /* returning from main() */
   5450             mach->CondStackTop = 0;
   5451             mach->LoopStackTop = 0;
   5452             mach->ContStackTop = 0;
   5453             mach->LoopLabelStackTop = 0;
   5454             mach->SwitchStackTop = 0;
   5455             mach->BreakStackTop = 0;
   5456             *pc = -1;
   5457             return FALSE;
   5458          }
   5459 
   5460          assert(mach->CallStackTop > 0);
   5461          mach->CallStackTop--;
   5462 
   5463          mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
   5464          mach->CondMask = mach->CondStack[mach->CondStackTop];
   5465 
   5466          mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
   5467          mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
   5468 
   5469          mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
   5470          mach->ContMask = mach->ContStack[mach->ContStackTop];
   5471 
   5472          mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
   5473          mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
   5474 
   5475          mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
   5476          mach->BreakType = mach->BreakStack[mach->BreakStackTop];
   5477 
   5478          assert(mach->FuncStackTop > 0);
   5479          mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
   5480 
   5481          *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
   5482 
   5483          UPDATE_EXEC_MASK(mach);
   5484       }
   5485       break;
   5486 
   5487    case TGSI_OPCODE_SSG:
   5488       exec_vector_unary(mach, inst, micro_sgn, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5489       break;
   5490 
   5491    case TGSI_OPCODE_CMP:
   5492       exec_vector_trinary(mach, inst, micro_cmp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5493       break;
   5494 
   5495    case TGSI_OPCODE_SCS:
   5496       exec_scs(mach, inst);
   5497       break;
   5498 
   5499    case TGSI_OPCODE_DIV:
   5500       exec_vector_binary(mach, inst, micro_div, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5501       break;
   5502 
   5503    case TGSI_OPCODE_DP2:
   5504       exec_dp2(mach, inst);
   5505       break;
   5506 
   5507    case TGSI_OPCODE_IF:
   5508       /* push CondMask */
   5509       assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
   5510       mach->CondStack[mach->CondStackTop++] = mach->CondMask;
   5511       FETCH( &r[0], 0, TGSI_CHAN_X );
   5512       /* update CondMask */
   5513       if( ! r[0].f[0] ) {
   5514          mach->CondMask &= ~0x1;
   5515       }
   5516       if( ! r[0].f[1] ) {
   5517          mach->CondMask &= ~0x2;
   5518       }
   5519       if( ! r[0].f[2] ) {
   5520          mach->CondMask &= ~0x4;
   5521       }
   5522       if( ! r[0].f[3] ) {
   5523          mach->CondMask &= ~0x8;
   5524       }
   5525       UPDATE_EXEC_MASK(mach);
   5526       /* Todo: If CondMask==0, jump to ELSE */
   5527       break;
   5528 
   5529    case TGSI_OPCODE_UIF:
   5530       /* push CondMask */
   5531       assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
   5532       mach->CondStack[mach->CondStackTop++] = mach->CondMask;
   5533       IFETCH( &r[0], 0, TGSI_CHAN_X );
   5534       /* update CondMask */
   5535       if( ! r[0].u[0] ) {
   5536          mach->CondMask &= ~0x1;
   5537       }
   5538       if( ! r[0].u[1] ) {
   5539          mach->CondMask &= ~0x2;
   5540       }
   5541       if( ! r[0].u[2] ) {
   5542          mach->CondMask &= ~0x4;
   5543       }
   5544       if( ! r[0].u[3] ) {
   5545          mach->CondMask &= ~0x8;
   5546       }
   5547       UPDATE_EXEC_MASK(mach);
   5548       /* Todo: If CondMask==0, jump to ELSE */
   5549       break;
   5550 
   5551    case TGSI_OPCODE_ELSE:
   5552       /* invert CondMask wrt previous mask */
   5553       {
   5554          uint prevMask;
   5555          assert(mach->CondStackTop > 0);
   5556          prevMask = mach->CondStack[mach->CondStackTop - 1];
   5557          mach->CondMask = ~mach->CondMask & prevMask;
   5558          UPDATE_EXEC_MASK(mach);
   5559          /* Todo: If CondMask==0, jump to ENDIF */
   5560       }
   5561       break;
   5562 
   5563    case TGSI_OPCODE_ENDIF:
   5564       /* pop CondMask */
   5565       assert(mach->CondStackTop > 0);
   5566       mach->CondMask = mach->CondStack[--mach->CondStackTop];
   5567       UPDATE_EXEC_MASK(mach);
   5568       break;
   5569 
   5570    case TGSI_OPCODE_END:
   5571       /* make sure we end primitives which haven't
   5572        * been explicitly emitted */
   5573       conditional_emit_primitive(mach);
   5574       /* halt execution */
   5575       *pc = -1;
   5576       break;
   5577 
   5578    case TGSI_OPCODE_PUSHA:
   5579       assert (0);
   5580       break;
   5581 
   5582    case TGSI_OPCODE_POPA:
   5583       assert (0);
   5584       break;
   5585 
   5586    case TGSI_OPCODE_CEIL:
   5587       exec_vector_unary(mach, inst, micro_ceil, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5588       break;
   5589 
   5590    case TGSI_OPCODE_I2F:
   5591       exec_vector_unary(mach, inst, micro_i2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_INT);
   5592       break;
   5593 
   5594    case TGSI_OPCODE_NOT:
   5595       exec_vector_unary(mach, inst, micro_not, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   5596       break;
   5597 
   5598    case TGSI_OPCODE_TRUNC:
   5599       exec_vector_unary(mach, inst, micro_trunc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5600       break;
   5601 
   5602    case TGSI_OPCODE_SHL:
   5603       exec_vector_binary(mach, inst, micro_shl, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   5604       break;
   5605 
   5606    case TGSI_OPCODE_AND:
   5607       exec_vector_binary(mach, inst, micro_and, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   5608       break;
   5609 
   5610    case TGSI_OPCODE_OR:
   5611       exec_vector_binary(mach, inst, micro_or, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   5612       break;
   5613 
   5614    case TGSI_OPCODE_MOD:
   5615       exec_vector_binary(mach, inst, micro_mod, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
   5616       break;
   5617 
   5618    case TGSI_OPCODE_XOR:
   5619       exec_vector_binary(mach, inst, micro_xor, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   5620       break;
   5621 
   5622    case TGSI_OPCODE_SAD:
   5623       assert (0);
   5624       break;
   5625 
   5626    case TGSI_OPCODE_TXF:
   5627       exec_txf(mach, inst);
   5628       break;
   5629 
   5630    case TGSI_OPCODE_TXQ:
   5631       exec_txq(mach, inst);
   5632       break;
   5633 
   5634    case TGSI_OPCODE_EMIT:
   5635       emit_vertex(mach);
   5636       break;
   5637 
   5638    case TGSI_OPCODE_ENDPRIM:
   5639       emit_primitive(mach);
   5640       break;
   5641 
   5642    case TGSI_OPCODE_BGNLOOP:
   5643       /* push LoopMask and ContMasks */
   5644       assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
   5645       assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
   5646       assert(mach->LoopLabelStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
   5647       assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
   5648 
   5649       mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
   5650       mach->ContStack[mach->ContStackTop++] = mach->ContMask;
   5651       mach->LoopLabelStack[mach->LoopLabelStackTop++] = *pc - 1;
   5652       mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
   5653       mach->BreakType = TGSI_EXEC_BREAK_INSIDE_LOOP;
   5654       break;
   5655 
   5656    case TGSI_OPCODE_ENDLOOP:
   5657       /* Restore ContMask, but don't pop */
   5658       assert(mach->ContStackTop > 0);
   5659       mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
   5660       UPDATE_EXEC_MASK(mach);
   5661       if (mach->ExecMask) {
   5662          /* repeat loop: jump to instruction just past BGNLOOP */
   5663          assert(mach->LoopLabelStackTop > 0);
   5664          *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
   5665       }
   5666       else {
   5667          /* exit loop: pop LoopMask */
   5668          assert(mach->LoopStackTop > 0);
   5669          mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
   5670          /* pop ContMask */
   5671          assert(mach->ContStackTop > 0);
   5672          mach->ContMask = mach->ContStack[--mach->ContStackTop];
   5673          assert(mach->LoopLabelStackTop > 0);
   5674          --mach->LoopLabelStackTop;
   5675 
   5676          mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
   5677       }
   5678       UPDATE_EXEC_MASK(mach);
   5679       break;
   5680 
   5681    case TGSI_OPCODE_BRK:
   5682       exec_break(mach);
   5683       break;
   5684 
   5685    case TGSI_OPCODE_CONT:
   5686       /* turn off cont channels for each enabled exec channel */
   5687       mach->ContMask &= ~mach->ExecMask;
   5688       /* Todo: if mach->LoopMask == 0, jump to end of loop */
   5689       UPDATE_EXEC_MASK(mach);
   5690       break;
   5691 
   5692    case TGSI_OPCODE_BGNSUB:
   5693       /* no-op */
   5694       break;
   5695 
   5696    case TGSI_OPCODE_ENDSUB:
   5697       /*
   5698        * XXX: This really should be a no-op. We should never reach this opcode.
   5699        */
   5700 
   5701       assert(mach->CallStackTop > 0);
   5702       mach->CallStackTop--;
   5703 
   5704       mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
   5705       mach->CondMask = mach->CondStack[mach->CondStackTop];
   5706 
   5707       mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
   5708       mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
   5709 
   5710       mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
   5711       mach->ContMask = mach->ContStack[mach->ContStackTop];
   5712 
   5713       mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
   5714       mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
   5715 
   5716       mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
   5717       mach->BreakType = mach->BreakStack[mach->BreakStackTop];
   5718 
   5719       assert(mach->FuncStackTop > 0);
   5720       mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
   5721 
   5722       *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
   5723 
   5724       UPDATE_EXEC_MASK(mach);
   5725       break;
   5726 
   5727    case TGSI_OPCODE_NOP:
   5728       break;
   5729 
   5730    case TGSI_OPCODE_BREAKC:
   5731       IFETCH(&r[0], 0, TGSI_CHAN_X);
   5732       /* clear LoopMask for active channels whose break condition is non-zero */
   5733       if (r[0].u[0] && (mach->ExecMask & 0x1)) {
   5734          mach->LoopMask &= ~0x1;
   5735       }
   5736       if (r[0].u[1] && (mach->ExecMask & 0x2)) {
   5737          mach->LoopMask &= ~0x2;
   5738       }
   5739       if (r[0].u[2] && (mach->ExecMask & 0x4)) {
   5740          mach->LoopMask &= ~0x4;
   5741       }
   5742       if (r[0].u[3] && (mach->ExecMask & 0x8)) {
   5743          mach->LoopMask &= ~0x8;
   5744       }
   5745       /* Todo: if mach->LoopMask == 0, jump to end of loop */
   5746       UPDATE_EXEC_MASK(mach);
   5747       break;
   5748 
   5749    case TGSI_OPCODE_F2I:
   5750       exec_vector_unary(mach, inst, micro_f2i, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
   5751       break;
   5752 
   5753    case TGSI_OPCODE_FSEQ:
   5754       exec_vector_binary(mach, inst, micro_fseq, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
   5755       break;
   5756 
   5757    case TGSI_OPCODE_FSGE:
   5758       exec_vector_binary(mach, inst, micro_fsge, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
   5759       break;
   5760 
   5761    case TGSI_OPCODE_FSLT:
   5762       exec_vector_binary(mach, inst, micro_fslt, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
   5763       break;
   5764 
   5765    case TGSI_OPCODE_FSNE:
   5766       exec_vector_binary(mach, inst, micro_fsne, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
   5767       break;
   5768 
   5769    case TGSI_OPCODE_IDIV:
   5770       exec_vector_binary(mach, inst, micro_idiv, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
   5771       break;
   5772 
   5773    case TGSI_OPCODE_IMAX:
   5774       exec_vector_binary(mach, inst, micro_imax, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
   5775       break;
   5776 
   5777    case TGSI_OPCODE_IMIN:
   5778       exec_vector_binary(mach, inst, micro_imin, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
   5779       break;
   5780 
   5781    case TGSI_OPCODE_INEG:
   5782       exec_vector_unary(mach, inst, micro_ineg, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
   5783       break;
   5784 
   5785    case TGSI_OPCODE_ISGE:
   5786       exec_vector_binary(mach, inst, micro_isge, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
   5787       break;
   5788 
   5789    case TGSI_OPCODE_ISHR:
   5790       exec_vector_binary(mach, inst, micro_ishr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
   5791       break;
   5792 
   5793    case TGSI_OPCODE_ISLT:
   5794       exec_vector_binary(mach, inst, micro_islt, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
   5795       break;
   5796 
   5797    case TGSI_OPCODE_F2U:
   5798       exec_vector_unary(mach, inst, micro_f2u, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
   5799       break;
   5800 
   5801    case TGSI_OPCODE_U2F:
   5802       exec_vector_unary(mach, inst, micro_u2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_UINT);
   5803       break;
   5804 
   5805    case TGSI_OPCODE_UADD:
   5806       exec_vector_binary(mach, inst, micro_uadd, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
   5807       break;
   5808 
   5809    case TGSI_OPCODE_UDIV:
   5810       exec_vector_binary(mach, inst, micro_udiv, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   5811       break;
   5812 
   5813    case TGSI_OPCODE_UMAD:
   5814       exec_vector_trinary(mach, inst, micro_umad, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   5815       break;
   5816 
   5817    case TGSI_OPCODE_UMAX:
   5818       exec_vector_binary(mach, inst, micro_umax, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   5819       break;
   5820 
   5821    case TGSI_OPCODE_UMIN:
   5822       exec_vector_binary(mach, inst, micro_umin, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   5823       break;
   5824 
   5825    case TGSI_OPCODE_UMOD:
   5826       exec_vector_binary(mach, inst, micro_umod, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   5827       break;
   5828 
   5829    case TGSI_OPCODE_UMUL:
   5830       exec_vector_binary(mach, inst, micro_umul, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   5831       break;
   5832 
   5833    case TGSI_OPCODE_IMUL_HI:
   5834       exec_vector_binary(mach, inst, micro_imul_hi, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
   5835       break;
   5836 
   5837    case TGSI_OPCODE_UMUL_HI:
   5838       exec_vector_binary(mach, inst, micro_umul_hi, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   5839       break;
   5840 
   5841    case TGSI_OPCODE_USEQ:
   5842       exec_vector_binary(mach, inst, micro_useq, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   5843       break;
   5844 
   5845    case TGSI_OPCODE_USGE:
   5846       exec_vector_binary(mach, inst, micro_usge, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   5847       break;
   5848 
   5849    case TGSI_OPCODE_USHR:
   5850       exec_vector_binary(mach, inst, micro_ushr, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   5851       break;
   5852 
   5853    case TGSI_OPCODE_USLT:
   5854       exec_vector_binary(mach, inst, micro_uslt, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   5855       break;
   5856 
   5857    case TGSI_OPCODE_USNE:
   5858       exec_vector_binary(mach, inst, micro_usne, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   5859       break;
   5860 
   5861    case TGSI_OPCODE_SWITCH:
   5862       exec_switch(mach, inst);
   5863       break;
   5864 
   5865    case TGSI_OPCODE_CASE:
   5866       exec_case(mach, inst);
   5867       break;
   5868 
   5869    case TGSI_OPCODE_DEFAULT:
   5870       exec_default(mach);
   5871       break;
   5872 
   5873    case TGSI_OPCODE_ENDSWITCH:
   5874       exec_endswitch(mach);
   5875       break;
   5876 
   5877    case TGSI_OPCODE_SAMPLE_I:
   5878       exec_txf(mach, inst);
   5879       break;
   5880 
   5881    case TGSI_OPCODE_SAMPLE_I_MS:
   5882       exec_txf(mach, inst);
   5883       break;
   5884 
   5885    case TGSI_OPCODE_SAMPLE:
   5886       exec_sample(mach, inst, TEX_MODIFIER_NONE, FALSE);
   5887       break;
   5888 
   5889    case TGSI_OPCODE_SAMPLE_B:
   5890       exec_sample(mach, inst, TEX_MODIFIER_LOD_BIAS, FALSE);
   5891       break;
   5892 
   5893    case TGSI_OPCODE_SAMPLE_C:
   5894       exec_sample(mach, inst, TEX_MODIFIER_NONE, TRUE);
   5895       break;
   5896 
   5897    case TGSI_OPCODE_SAMPLE_C_LZ:
   5898       exec_sample(mach, inst, TEX_MODIFIER_LEVEL_ZERO, TRUE);
   5899       break;
   5900 
   5901    case TGSI_OPCODE_SAMPLE_D:
   5902       exec_sample_d(mach, inst);
   5903       break;
   5904 
   5905    case TGSI_OPCODE_SAMPLE_L:
   5906       exec_sample(mach, inst, TEX_MODIFIER_EXPLICIT_LOD, FALSE);
   5907       break;
   5908 
   5909    case TGSI_OPCODE_GATHER4:
   5910       assert(0);
   5911       break;
   5912 
   5913    case TGSI_OPCODE_SVIEWINFO:
   5914       exec_txq(mach, inst);
   5915       break;
   5916 
   5917    case TGSI_OPCODE_SAMPLE_POS:
   5918       assert(0);
   5919       break;
   5920 
   5921    case TGSI_OPCODE_SAMPLE_INFO:
   5922       assert(0);
   5923       break;
   5924 
   5925    case TGSI_OPCODE_UARL:
   5926       exec_vector_unary(mach, inst, micro_uarl, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_UINT);
   5927       break;
   5928 
   5929    case TGSI_OPCODE_UCMP:
   5930       exec_vector_trinary(mach, inst, micro_ucmp, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   5931       break;
   5932 
   5933    case TGSI_OPCODE_IABS:
   5934       exec_vector_unary(mach, inst, micro_iabs, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
   5935       break;
   5936 
   5937    case TGSI_OPCODE_ISSG:
   5938       exec_vector_unary(mach, inst, micro_isgn, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
   5939       break;
   5940 
   5941    case TGSI_OPCODE_TEX2:
   5942       /* simple texture lookup */
   5943       /* src[0] = texcoord */
   5944       /* src[1] = compare */
   5945       /* src[2] = sampler unit */
   5946       exec_tex(mach, inst, TEX_MODIFIER_NONE, 2);
   5947       break;
   5948    case TGSI_OPCODE_TXB2:
   5949       /* simple texture lookup */
   5950       /* src[0] = texcoord */
   5951       /* src[1] = bias */
   5952       /* src[2] = sampler unit */
   5953       exec_tex(mach, inst, TEX_MODIFIER_LOD_BIAS, 2);
   5954       break;
   5955    case TGSI_OPCODE_TXL2:
   5956       /* simple texture lookup */
   5957       /* src[0] = texcoord */
   5958       /* src[1] = lod */
   5959       /* src[2] = sampler unit */
   5960       exec_tex(mach, inst, TEX_MODIFIER_EXPLICIT_LOD, 2);
   5961       break;
   5962 
   5963    case TGSI_OPCODE_IBFE:
   5964       exec_vector_trinary(mach, inst, micro_ibfe, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
   5965       break;
   5966    case TGSI_OPCODE_UBFE:
   5967       exec_vector_trinary(mach, inst, micro_ubfe, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   5968       break;
   5969    case TGSI_OPCODE_BFI:
   5970       exec_vector_quaternary(mach, inst, micro_bfi, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   5971       break;
   5972    case TGSI_OPCODE_BREV:
   5973       exec_vector_unary(mach, inst, micro_brev, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   5974       break;
   5975    case TGSI_OPCODE_POPC:
   5976       exec_vector_unary(mach, inst, micro_popc, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   5977       break;
   5978    case TGSI_OPCODE_LSB:
   5979       exec_vector_unary(mach, inst, micro_lsb, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_UINT);
   5980       break;
   5981    case TGSI_OPCODE_IMSB:
   5982       exec_vector_unary(mach, inst, micro_imsb, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
   5983       break;
   5984    case TGSI_OPCODE_UMSB:
   5985       exec_vector_unary(mach, inst, micro_umsb, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_UINT);
   5986       break;
   5987 
   5988    case TGSI_OPCODE_F2D:
   5989       exec_t_2_64(mach, inst, micro_f2d, TGSI_EXEC_DATA_FLOAT);
   5990       break;
   5991 
   5992    case TGSI_OPCODE_D2F:
   5993       exec_64_2_t(mach, inst, micro_d2f, TGSI_EXEC_DATA_FLOAT);
   5994       break;
   5995 
   5996    case TGSI_OPCODE_DABS:
   5997       exec_double_unary(mach, inst, micro_dabs);
   5998       break;
   5999 
   6000    case TGSI_OPCODE_DNEG:
   6001       exec_double_unary(mach, inst, micro_dneg);
   6002       break;
   6003 
   6004    case TGSI_OPCODE_DADD:
   6005       exec_double_binary(mach, inst, micro_dadd, TGSI_EXEC_DATA_DOUBLE);
   6006       break;
   6007 
   6008    case TGSI_OPCODE_DDIV:
   6009       exec_double_binary(mach, inst, micro_ddiv, TGSI_EXEC_DATA_DOUBLE);
   6010       break;
   6011 
   6012    case TGSI_OPCODE_DMUL:
   6013       exec_double_binary(mach, inst, micro_dmul, TGSI_EXEC_DATA_DOUBLE);
   6014       break;
   6015 
   6016    case TGSI_OPCODE_DMAX:
   6017       exec_double_binary(mach, inst, micro_dmax, TGSI_EXEC_DATA_DOUBLE);
   6018       break;
   6019 
   6020    case TGSI_OPCODE_DMIN:
   6021       exec_double_binary(mach, inst, micro_dmin, TGSI_EXEC_DATA_DOUBLE);
   6022       break;
   6023 
   6024    case TGSI_OPCODE_DSLT:
   6025       exec_double_binary(mach, inst, micro_dslt, TGSI_EXEC_DATA_UINT);
   6026       break;
   6027 
   6028    case TGSI_OPCODE_DSGE:
   6029       exec_double_binary(mach, inst, micro_dsge, TGSI_EXEC_DATA_UINT);
   6030       break;
   6031 
   6032    case TGSI_OPCODE_DSEQ:
   6033       exec_double_binary(mach, inst, micro_dseq, TGSI_EXEC_DATA_UINT);
   6034       break;
   6035 
   6036    case TGSI_OPCODE_DSNE:
   6037       exec_double_binary(mach, inst, micro_dsne, TGSI_EXEC_DATA_UINT);
   6038       break;
   6039 
   6040    case TGSI_OPCODE_DRCP:
   6041       exec_double_unary(mach, inst, micro_drcp);
   6042       break;
   6043 
   6044    case TGSI_OPCODE_DSQRT:
   6045       exec_double_unary(mach, inst, micro_dsqrt);
   6046       break;
   6047 
   6048    case TGSI_OPCODE_DRSQ:
   6049       exec_double_unary(mach, inst, micro_drsq);
   6050       break;
   6051 
   6052    case TGSI_OPCODE_DMAD:
   6053       exec_double_trinary(mach, inst, micro_dmad);
   6054       break;
   6055 
   6056    case TGSI_OPCODE_DFRAC:
   6057       exec_double_unary(mach, inst, micro_dfrac);
   6058       break;
   6059 
   6060    case TGSI_OPCODE_DLDEXP:
   6061       exec_dldexp(mach, inst);
   6062       break;
   6063 
   6064    case TGSI_OPCODE_DFRACEXP:
   6065       exec_dfracexp(mach, inst);
   6066       break;
   6067 
   6068    case TGSI_OPCODE_I2D:
   6069       exec_t_2_64(mach, inst, micro_i2d, TGSI_EXEC_DATA_INT);
   6070       break;
   6071 
   6072    case TGSI_OPCODE_D2I:
   6073       exec_64_2_t(mach, inst, micro_d2i, TGSI_EXEC_DATA_INT);
   6074       break;
   6075 
   6076    case TGSI_OPCODE_U2D:
   6077       exec_t_2_64(mach, inst, micro_u2d, TGSI_EXEC_DATA_UINT);
   6078       break;
   6079 
   6080    case TGSI_OPCODE_D2U:
   6081       exec_64_2_t(mach, inst, micro_d2u, TGSI_EXEC_DATA_INT);
   6082       break;
   6083 
   6084    case TGSI_OPCODE_LOAD:
   6085       exec_load(mach, inst);
   6086       break;
   6087 
   6088    case TGSI_OPCODE_STORE:
   6089       exec_store(mach, inst);
   6090       break;
   6091 
   6092    case TGSI_OPCODE_ATOMUADD:
   6093    case TGSI_OPCODE_ATOMXCHG:
   6094    case TGSI_OPCODE_ATOMCAS:
   6095    case TGSI_OPCODE_ATOMAND:
   6096    case TGSI_OPCODE_ATOMOR:
   6097    case TGSI_OPCODE_ATOMXOR:
   6098    case TGSI_OPCODE_ATOMUMIN:
   6099    case TGSI_OPCODE_ATOMUMAX:
   6100    case TGSI_OPCODE_ATOMIMIN:
   6101    case TGSI_OPCODE_ATOMIMAX:
   6102       exec_atomop(mach, inst);
   6103       break;
   6104 
   6105    case TGSI_OPCODE_RESQ:
   6106       exec_resq(mach, inst);
   6107       break;
   6108    case TGSI_OPCODE_BARRIER:
   6109    case TGSI_OPCODE_MEMBAR:
   6110       return TRUE;
   6111       break;
   6112 
   6113    case TGSI_OPCODE_I64ABS:
   6114       exec_double_unary(mach, inst, micro_i64abs);
   6115       break;
   6116 
   6117    case TGSI_OPCODE_I64SSG:
   6118       exec_double_unary(mach, inst, micro_i64sgn);
   6119       break;
   6120 
   6121    case TGSI_OPCODE_I64NEG:
   6122       exec_double_unary(mach, inst, micro_i64neg);
   6123       break;
   6124 
   6125    case TGSI_OPCODE_U64SEQ:
   6126       exec_double_binary(mach, inst, micro_u64seq, TGSI_EXEC_DATA_UINT);
   6127       break;
   6128 
   6129    case TGSI_OPCODE_U64SNE:
   6130       exec_double_binary(mach, inst, micro_u64sne, TGSI_EXEC_DATA_UINT);
   6131       break;
   6132 
   6133    case TGSI_OPCODE_I64SLT:
   6134       exec_double_binary(mach, inst, micro_i64slt, TGSI_EXEC_DATA_UINT);
   6135       break;
   6136    case TGSI_OPCODE_U64SLT:
   6137       exec_double_binary(mach, inst, micro_u64slt, TGSI_EXEC_DATA_UINT);
   6138       break;
   6139 
   6140    case TGSI_OPCODE_I64SGE:
   6141       exec_double_binary(mach, inst, micro_i64sge, TGSI_EXEC_DATA_UINT);
   6142       break;
   6143    case TGSI_OPCODE_U64SGE:
   6144       exec_double_binary(mach, inst, micro_u64sge, TGSI_EXEC_DATA_UINT);
   6145       break;
   6146 
   6147    case TGSI_OPCODE_I64MIN:
   6148       exec_double_binary(mach, inst, micro_i64min, TGSI_EXEC_DATA_INT64);
   6149       break;
   6150    case TGSI_OPCODE_U64MIN:
   6151       exec_double_binary(mach, inst, micro_u64min, TGSI_EXEC_DATA_UINT64);
   6152       break;
   6153    case TGSI_OPCODE_I64MAX:
   6154       exec_double_binary(mach, inst, micro_i64max, TGSI_EXEC_DATA_INT64);
   6155       break;
   6156    case TGSI_OPCODE_U64MAX:
   6157       exec_double_binary(mach, inst, micro_u64max, TGSI_EXEC_DATA_UINT64);
   6158       break;
   6159    case TGSI_OPCODE_U64ADD:
   6160       exec_double_binary(mach, inst, micro_u64add, TGSI_EXEC_DATA_UINT64);
   6161       break;
   6162    case TGSI_OPCODE_U64MUL:
   6163       exec_double_binary(mach, inst, micro_u64mul, TGSI_EXEC_DATA_UINT64);
   6164       break;
   6165    case TGSI_OPCODE_U64SHL:
   6166       exec_arg0_64_arg1_32(mach, inst, micro_u64shl);
   6167       break;
   6168    case TGSI_OPCODE_I64SHR:
   6169       exec_arg0_64_arg1_32(mach, inst, micro_i64shr);
   6170       break;
   6171    case TGSI_OPCODE_U64SHR:
   6172       exec_arg0_64_arg1_32(mach, inst, micro_u64shr);
   6173       break;
   6174    case TGSI_OPCODE_U64DIV:
   6175       exec_double_binary(mach, inst, micro_u64div, TGSI_EXEC_DATA_UINT64);
   6176       break;
   6177    case TGSI_OPCODE_I64DIV:
   6178       exec_double_binary(mach, inst, micro_i64div, TGSI_EXEC_DATA_INT64);
   6179       break;
   6180    case TGSI_OPCODE_U64MOD:
   6181       exec_double_binary(mach, inst, micro_u64mod, TGSI_EXEC_DATA_UINT64);
   6182       break;
   6183    case TGSI_OPCODE_I64MOD:
   6184       exec_double_binary(mach, inst, micro_i64mod, TGSI_EXEC_DATA_INT64);
   6185       break;
   6186 
   6187    case TGSI_OPCODE_F2U64:
   6188       exec_t_2_64(mach, inst, micro_f2u64, TGSI_EXEC_DATA_FLOAT);
   6189       break;
   6190 
   6191    case TGSI_OPCODE_F2I64:
   6192       exec_t_2_64(mach, inst, micro_f2i64, TGSI_EXEC_DATA_FLOAT);
   6193       break;
   6194 
   6195    case TGSI_OPCODE_U2I64:
   6196       exec_t_2_64(mach, inst, micro_u2i64, TGSI_EXEC_DATA_INT);
   6197       break;
   6198    case TGSI_OPCODE_I2I64:
   6199       exec_t_2_64(mach, inst, micro_i2i64, TGSI_EXEC_DATA_INT);
   6200       break;
   6201 
   6202    case TGSI_OPCODE_D2U64:
   6203       exec_double_unary(mach, inst, micro_d2u64);
   6204       break;
   6205 
   6206    case TGSI_OPCODE_D2I64:
   6207       exec_double_unary(mach, inst, micro_d2i64);
   6208       break;
   6209 
   6210    case TGSI_OPCODE_U642F:
   6211       exec_64_2_t(mach, inst, micro_u642f, TGSI_EXEC_DATA_FLOAT);
   6212       break;
   6213    case TGSI_OPCODE_I642F:
   6214       exec_64_2_t(mach, inst, micro_i642f, TGSI_EXEC_DATA_FLOAT);
   6215       break;
   6216 
   6217    case TGSI_OPCODE_U642D:
   6218       exec_double_unary(mach, inst, micro_u642d);
   6219       break;
   6220    case TGSI_OPCODE_I642D:
   6221       exec_double_unary(mach, inst, micro_i642d);
   6222       break;
   6223 
   6224    default:
   6225       assert( 0 );
   6226    }
   6227    return FALSE;
   6228 }
   6229 
   6230 static void
   6231 tgsi_exec_machine_setup_masks(struct tgsi_exec_machine *mach)
   6232 {
   6233    uint default_mask = 0xf;
   6234 
   6235    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
   6236    mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
   6237 
   6238    if (mach->ShaderType == PIPE_SHADER_GEOMETRY) {
   6239       mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
   6240       mach->Primitives[0] = 0;
   6241       /* GS runs on a single primitive for now */
   6242       default_mask = 0x1;
   6243    }
   6244 
   6245    if (mach->NonHelperMask == 0)
   6246       mach->NonHelperMask = default_mask;
   6247    mach->CondMask = default_mask;
   6248    mach->LoopMask = default_mask;
   6249    mach->ContMask = default_mask;
   6250    mach->FuncMask = default_mask;
   6251    mach->ExecMask = default_mask;
   6252 
   6253    mach->Switch.mask = default_mask;
   6254 
   6255    assert(mach->CondStackTop == 0);
   6256    assert(mach->LoopStackTop == 0);
   6257    assert(mach->ContStackTop == 0);
   6258    assert(mach->SwitchStackTop == 0);
   6259    assert(mach->BreakStackTop == 0);
   6260    assert(mach->CallStackTop == 0);
   6261 }
   6262 
   6263 /**
   6264  * Run TGSI interpreter.
   6265  * \return bitmask of "alive" quad components
   6266  */
   6267 uint
   6268 tgsi_exec_machine_run( struct tgsi_exec_machine *mach, int start_pc )
   6269 {
   6270    uint i;
   6271 
   6272    mach->pc = start_pc;
   6273 
   6274    if (!start_pc) {
   6275       tgsi_exec_machine_setup_masks(mach);
   6276 
   6277       /* execute declarations (interpolants) */
   6278       for (i = 0; i < mach->NumDeclarations; i++) {
   6279          exec_declaration( mach, mach->Declarations+i );
   6280       }
   6281    }
   6282 
   6283    {
   6284 #if DEBUG_EXECUTION
   6285       struct tgsi_exec_vector temps[TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS];
   6286       struct tgsi_exec_vector outputs[PIPE_MAX_ATTRIBS];
   6287       uint inst = 1;
   6288 
   6289       if (!start_pc) {
   6290          memset(mach->Temps, 0, sizeof(temps));
   6291          if (mach->Outputs)
   6292             memset(mach->Outputs, 0, sizeof(outputs));
   6293          memset(temps, 0, sizeof(temps));
   6294          memset(outputs, 0, sizeof(outputs));
   6295       }
   6296 #endif
   6297 
   6298       /* execute instructions, until pc is set to -1 */
   6299       while (mach->pc != -1) {
   6300          boolean barrier_hit;
   6301 #if DEBUG_EXECUTION
   6302          uint i;
   6303 
   6304          tgsi_dump_instruction(&mach->Instructions[mach->pc], inst++);
   6305 #endif
   6306 
   6307          assert(mach->pc < (int) mach->NumInstructions);
   6308          barrier_hit = exec_instruction(mach, mach->Instructions + mach->pc, &mach->pc);
   6309 
   6310          /* for compute shaders if we hit a barrier return now for later rescheduling */
   6311          if (barrier_hit && mach->ShaderType == PIPE_SHADER_COMPUTE)
   6312             return 0;
   6313 
   6314 #if DEBUG_EXECUTION
   6315          for (i = 0; i < TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS; i++) {
   6316             if (memcmp(&temps[i], &mach->Temps[i], sizeof(temps[i]))) {
   6317                uint j;
   6318 
   6319                memcpy(&temps[i], &mach->Temps[i], sizeof(temps[i]));
   6320                debug_printf("TEMP[%2u] = ", i);
   6321                for (j = 0; j < 4; j++) {
   6322                   if (j > 0) {
   6323                      debug_printf("           ");
   6324                   }
   6325                   debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
   6326                                temps[i].xyzw[0].f[j], temps[i].xyzw[0].u[j],
   6327                                temps[i].xyzw[1].f[j], temps[i].xyzw[1].u[j],
   6328                                temps[i].xyzw[2].f[j], temps[i].xyzw[2].u[j],
   6329                                temps[i].xyzw[3].f[j], temps[i].xyzw[3].u[j]);
   6330                }
   6331             }
   6332          }
   6333          if (mach->Outputs) {
   6334             for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
   6335                if (memcmp(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]))) {
   6336                   uint j;
   6337 
   6338                   memcpy(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]));
   6339                   debug_printf("OUT[%2u] =  ", i);
   6340                   for (j = 0; j < 4; j++) {
   6341                      if (j > 0) {
   6342                         debug_printf("           ");
   6343                      }
   6344                      debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
   6345                                   outputs[i].xyzw[0].f[j], outputs[i].xyzw[0].u[j],
   6346                                   outputs[i].xyzw[1].f[j], outputs[i].xyzw[1].u[j],
   6347                                   outputs[i].xyzw[2].f[j], outputs[i].xyzw[2].u[j],
   6348                                   outputs[i].xyzw[3].f[j], outputs[i].xyzw[3].u[j]);
   6349                   }
   6350                }
   6351             }
   6352          }
   6353 #endif
   6354       }
   6355    }
   6356 
   6357 #if 0
   6358    /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
   6359    if (mach->ShaderType == PIPE_SHADER_FRAGMENT) {
   6360       /*
   6361        * Scale back depth component.
   6362        */
   6363       for (i = 0; i < 4; i++)
   6364          mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
   6365    }
   6366 #endif
   6367 
   6368    /* Strictly speaking, these assertions aren't really needed but they
   6369     * can potentially catch some bugs in the control flow code.
   6370     */
   6371    assert(mach->CondStackTop == 0);
   6372    assert(mach->LoopStackTop == 0);
   6373    assert(mach->ContStackTop == 0);
   6374    assert(mach->SwitchStackTop == 0);
   6375    assert(mach->BreakStackTop == 0);
   6376    assert(mach->CallStackTop == 0);
   6377 
   6378    return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
   6379 }
   6380