Home | History | Annotate | Download | only in tgsi
      1 /**************************************************************************
      2  *
      3  * Copyright 2007-2008 VMware, Inc.
      4  * All Rights Reserved.
      5  * Copyright 2009-2010 VMware, Inc.  All rights Reserved.
      6  *
      7  * Permission is hereby granted, free of charge, to any person obtaining a
      8  * copy of this software and associated documentation files (the
      9  * "Software"), to deal in the Software without restriction, including
     10  * without limitation the rights to use, copy, modify, merge, publish,
     11  * distribute, sub license, and/or sell copies of the Software, and to
     12  * permit persons to whom the Software is furnished to do so, subject to
     13  * the following conditions:
     14  *
     15  * The above copyright notice and this permission notice (including the
     16  * next paragraph) shall be included in all copies or substantial portions
     17  * of the Software.
     18  *
     19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
     20  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
     21  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
     22  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
     23  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
     24  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
     25  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
     26  *
     27  **************************************************************************/
     28 
     29 /**
     30  * TGSI interpreter/executor.
     31  *
     32  * Flow control information:
     33  *
     34  * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
     35  * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
     36  * care since a condition may be true for some quad components but false
     37  * for other components.
     38  *
     39  * We basically execute all statements (even if they're in the part of
     40  * an IF/ELSE clause that's "not taken") and use a special mask to
     41  * control writing to destination registers.  This is the ExecMask.
     42  * See store_dest().
     43  *
 * The ExecMask is computed from three other masks (CondMask, LoopMask and
 * ContMask) which are controlled by the flow control instructions (namely:
 * IF/ELSE/ENDIF, LOOP/ENDLOOP and CONT).
     47  *
     48  *
     49  * Authors:
     50  *   Michal Krol
     51  *   Brian Paul
     52  */
     53 
     54 #include "pipe/p_compiler.h"
     55 #include "pipe/p_state.h"
     56 #include "pipe/p_shader_tokens.h"
     57 #include "tgsi/tgsi_dump.h"
     58 #include "tgsi/tgsi_parse.h"
     59 #include "tgsi/tgsi_util.h"
     60 #include "tgsi_exec.h"
     61 #include "util/u_half.h"
     62 #include "util/u_memory.h"
     63 #include "util/u_math.h"
     64 #include "util/rounding.h"
     65 
     66 
/*
 * Compile-time debug switch.
 * NOTE(review): presumably enables execution tracing further down the
 * file -- confirm against its users.
 */
#define DEBUG_EXECUTION 0


/*
 * When non-zero, micro_exp2() and micro_lg2() call util_fast_exp2() /
 * util_fast_log2() instead of powf() / logf().
 */
#define FAST_MATH 0

/*
 * Element indices within a channel, named by position.  micro_ddx() and
 * micro_ddy() use them to select the elements they subtract.
 */
#define TILE_TOP_LEFT     0
#define TILE_TOP_RIGHT    1
#define TILE_BOTTOM_LEFT  2
#define TILE_BOTTOM_RIGHT 3
     76 
/**
 * Storage for TGSI_QUAD_SIZE wide values, one per element, that can be
 * viewed as doubles, as pairs of unsigned words, or as unsigned/signed
 * 64-bit integers.
 */
union tgsi_double_channel {
   double d[TGSI_QUAD_SIZE];      /* floating-point view */
   unsigned u[TGSI_QUAD_SIZE][2]; /* two words per element; the comparison
                                   * helpers in this file write word [0] */
   uint64_t u64[TGSI_QUAD_SIZE];  /* unsigned 64-bit view */
   int64_t i64[TGSI_QUAD_SIZE];   /* signed 64-bit view */
};
     83 
/**
 * A pair of double channels.
 * NOTE(review): the member names suggest each one holds the values formed
 * from the x/y and z/w components of a four-component register -- confirm
 * against the code that fills this struct.
 */
struct tgsi_double_vector {
   union tgsi_double_channel xy;
   union tgsi_double_channel zw;
};
     88 
     89 static void
     90 micro_abs(union tgsi_exec_channel *dst,
     91           const union tgsi_exec_channel *src)
     92 {
     93    dst->f[0] = fabsf(src->f[0]);
     94    dst->f[1] = fabsf(src->f[1]);
     95    dst->f[2] = fabsf(src->f[2]);
     96    dst->f[3] = fabsf(src->f[3]);
     97 }
     98 
     99 static void
    100 micro_arl(union tgsi_exec_channel *dst,
    101           const union tgsi_exec_channel *src)
    102 {
    103    dst->i[0] = (int)floorf(src->f[0]);
    104    dst->i[1] = (int)floorf(src->f[1]);
    105    dst->i[2] = (int)floorf(src->f[2]);
    106    dst->i[3] = (int)floorf(src->f[3]);
    107 }
    108 
    109 static void
    110 micro_arr(union tgsi_exec_channel *dst,
    111           const union tgsi_exec_channel *src)
    112 {
    113    dst->i[0] = (int)floorf(src->f[0] + 0.5f);
    114    dst->i[1] = (int)floorf(src->f[1] + 0.5f);
    115    dst->i[2] = (int)floorf(src->f[2] + 0.5f);
    116    dst->i[3] = (int)floorf(src->f[3] + 0.5f);
    117 }
    118 
    119 static void
    120 micro_ceil(union tgsi_exec_channel *dst,
    121            const union tgsi_exec_channel *src)
    122 {
    123    dst->f[0] = ceilf(src->f[0]);
    124    dst->f[1] = ceilf(src->f[1]);
    125    dst->f[2] = ceilf(src->f[2]);
    126    dst->f[3] = ceilf(src->f[3]);
    127 }
    128 
    129 static void
    130 micro_cmp(union tgsi_exec_channel *dst,
    131           const union tgsi_exec_channel *src0,
    132           const union tgsi_exec_channel *src1,
    133           const union tgsi_exec_channel *src2)
    134 {
    135    dst->f[0] = src0->f[0] < 0.0f ? src1->f[0] : src2->f[0];
    136    dst->f[1] = src0->f[1] < 0.0f ? src1->f[1] : src2->f[1];
    137    dst->f[2] = src0->f[2] < 0.0f ? src1->f[2] : src2->f[2];
    138    dst->f[3] = src0->f[3] < 0.0f ? src1->f[3] : src2->f[3];
    139 }
    140 
    141 static void
    142 micro_cos(union tgsi_exec_channel *dst,
    143           const union tgsi_exec_channel *src)
    144 {
    145    dst->f[0] = cosf(src->f[0]);
    146    dst->f[1] = cosf(src->f[1]);
    147    dst->f[2] = cosf(src->f[2]);
    148    dst->f[3] = cosf(src->f[3]);
    149 }
    150 
    151 static void
    152 micro_d2f(union tgsi_exec_channel *dst,
    153           const union tgsi_double_channel *src)
    154 {
    155    dst->f[0] = (float)src->d[0];
    156    dst->f[1] = (float)src->d[1];
    157    dst->f[2] = (float)src->d[2];
    158    dst->f[3] = (float)src->d[3];
    159 }
    160 
    161 static void
    162 micro_d2i(union tgsi_exec_channel *dst,
    163           const union tgsi_double_channel *src)
    164 {
    165    dst->i[0] = (int)src->d[0];
    166    dst->i[1] = (int)src->d[1];
    167    dst->i[2] = (int)src->d[2];
    168    dst->i[3] = (int)src->d[3];
    169 }
    170 
    171 static void
    172 micro_d2u(union tgsi_exec_channel *dst,
    173           const union tgsi_double_channel *src)
    174 {
    175    dst->u[0] = (unsigned)src->d[0];
    176    dst->u[1] = (unsigned)src->d[1];
    177    dst->u[2] = (unsigned)src->d[2];
    178    dst->u[3] = (unsigned)src->d[3];
    179 }
    180 static void
    181 micro_dabs(union tgsi_double_channel *dst,
    182            const union tgsi_double_channel *src)
    183 {
    184    dst->d[0] = src->d[0] >= 0.0 ? src->d[0] : -src->d[0];
    185    dst->d[1] = src->d[1] >= 0.0 ? src->d[1] : -src->d[1];
    186    dst->d[2] = src->d[2] >= 0.0 ? src->d[2] : -src->d[2];
    187    dst->d[3] = src->d[3] >= 0.0 ? src->d[3] : -src->d[3];
    188 }
    189 
    190 static void
    191 micro_dadd(union tgsi_double_channel *dst,
    192           const union tgsi_double_channel *src)
    193 {
    194    dst->d[0] = src[0].d[0] + src[1].d[0];
    195    dst->d[1] = src[0].d[1] + src[1].d[1];
    196    dst->d[2] = src[0].d[2] + src[1].d[2];
    197    dst->d[3] = src[0].d[3] + src[1].d[3];
    198 }
    199 
    200 static void
    201 micro_ddiv(union tgsi_double_channel *dst,
    202           const union tgsi_double_channel *src)
    203 {
    204    dst->d[0] = src[0].d[0] / src[1].d[0];
    205    dst->d[1] = src[0].d[1] / src[1].d[1];
    206    dst->d[2] = src[0].d[2] / src[1].d[2];
    207    dst->d[3] = src[0].d[3] / src[1].d[3];
    208 }
    209 
    210 static void
    211 micro_ddx(union tgsi_exec_channel *dst,
    212           const union tgsi_exec_channel *src)
    213 {
    214    dst->f[0] =
    215    dst->f[1] =
    216    dst->f[2] =
    217    dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
    218 }
    219 
    220 static void
    221 micro_ddy(union tgsi_exec_channel *dst,
    222           const union tgsi_exec_channel *src)
    223 {
    224    dst->f[0] =
    225    dst->f[1] =
    226    dst->f[2] =
    227    dst->f[3] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
    228 }
    229 
    230 static void
    231 micro_dmul(union tgsi_double_channel *dst,
    232            const union tgsi_double_channel *src)
    233 {
    234    dst->d[0] = src[0].d[0] * src[1].d[0];
    235    dst->d[1] = src[0].d[1] * src[1].d[1];
    236    dst->d[2] = src[0].d[2] * src[1].d[2];
    237    dst->d[3] = src[0].d[3] * src[1].d[3];
    238 }
    239 
    240 static void
    241 micro_dmax(union tgsi_double_channel *dst,
    242            const union tgsi_double_channel *src)
    243 {
    244    dst->d[0] = src[0].d[0] > src[1].d[0] ? src[0].d[0] : src[1].d[0];
    245    dst->d[1] = src[0].d[1] > src[1].d[1] ? src[0].d[1] : src[1].d[1];
    246    dst->d[2] = src[0].d[2] > src[1].d[2] ? src[0].d[2] : src[1].d[2];
    247    dst->d[3] = src[0].d[3] > src[1].d[3] ? src[0].d[3] : src[1].d[3];
    248 }
    249 
    250 static void
    251 micro_dmin(union tgsi_double_channel *dst,
    252            const union tgsi_double_channel *src)
    253 {
    254    dst->d[0] = src[0].d[0] < src[1].d[0] ? src[0].d[0] : src[1].d[0];
    255    dst->d[1] = src[0].d[1] < src[1].d[1] ? src[0].d[1] : src[1].d[1];
    256    dst->d[2] = src[0].d[2] < src[1].d[2] ? src[0].d[2] : src[1].d[2];
    257    dst->d[3] = src[0].d[3] < src[1].d[3] ? src[0].d[3] : src[1].d[3];
    258 }
    259 
    260 static void
    261 micro_dneg(union tgsi_double_channel *dst,
    262            const union tgsi_double_channel *src)
    263 {
    264    dst->d[0] = -src->d[0];
    265    dst->d[1] = -src->d[1];
    266    dst->d[2] = -src->d[2];
    267    dst->d[3] = -src->d[3];
    268 }
    269 
    270 static void
    271 micro_dslt(union tgsi_double_channel *dst,
    272            const union tgsi_double_channel *src)
    273 {
    274    dst->u[0][0] = src[0].d[0] < src[1].d[0] ? ~0U : 0U;
    275    dst->u[1][0] = src[0].d[1] < src[1].d[1] ? ~0U : 0U;
    276    dst->u[2][0] = src[0].d[2] < src[1].d[2] ? ~0U : 0U;
    277    dst->u[3][0] = src[0].d[3] < src[1].d[3] ? ~0U : 0U;
    278 }
    279 
    280 static void
    281 micro_dsne(union tgsi_double_channel *dst,
    282            const union tgsi_double_channel *src)
    283 {
    284    dst->u[0][0] = src[0].d[0] != src[1].d[0] ? ~0U : 0U;
    285    dst->u[1][0] = src[0].d[1] != src[1].d[1] ? ~0U : 0U;
    286    dst->u[2][0] = src[0].d[2] != src[1].d[2] ? ~0U : 0U;
    287    dst->u[3][0] = src[0].d[3] != src[1].d[3] ? ~0U : 0U;
    288 }
    289 
    290 static void
    291 micro_dsge(union tgsi_double_channel *dst,
    292            const union tgsi_double_channel *src)
    293 {
    294    dst->u[0][0] = src[0].d[0] >= src[1].d[0] ? ~0U : 0U;
    295    dst->u[1][0] = src[0].d[1] >= src[1].d[1] ? ~0U : 0U;
    296    dst->u[2][0] = src[0].d[2] >= src[1].d[2] ? ~0U : 0U;
    297    dst->u[3][0] = src[0].d[3] >= src[1].d[3] ? ~0U : 0U;
    298 }
    299 
    300 static void
    301 micro_dseq(union tgsi_double_channel *dst,
    302            const union tgsi_double_channel *src)
    303 {
    304    dst->u[0][0] = src[0].d[0] == src[1].d[0] ? ~0U : 0U;
    305    dst->u[1][0] = src[0].d[1] == src[1].d[1] ? ~0U : 0U;
    306    dst->u[2][0] = src[0].d[2] == src[1].d[2] ? ~0U : 0U;
    307    dst->u[3][0] = src[0].d[3] == src[1].d[3] ? ~0U : 0U;
    308 }
    309 
    310 static void
    311 micro_drcp(union tgsi_double_channel *dst,
    312            const union tgsi_double_channel *src)
    313 {
    314    dst->d[0] = 1.0 / src->d[0];
    315    dst->d[1] = 1.0 / src->d[1];
    316    dst->d[2] = 1.0 / src->d[2];
    317    dst->d[3] = 1.0 / src->d[3];
    318 }
    319 
    320 static void
    321 micro_dsqrt(union tgsi_double_channel *dst,
    322             const union tgsi_double_channel *src)
    323 {
    324    dst->d[0] = sqrt(src->d[0]);
    325    dst->d[1] = sqrt(src->d[1]);
    326    dst->d[2] = sqrt(src->d[2]);
    327    dst->d[3] = sqrt(src->d[3]);
    328 }
    329 
    330 static void
    331 micro_drsq(union tgsi_double_channel *dst,
    332           const union tgsi_double_channel *src)
    333 {
    334    dst->d[0] = 1.0 / sqrt(src->d[0]);
    335    dst->d[1] = 1.0 / sqrt(src->d[1]);
    336    dst->d[2] = 1.0 / sqrt(src->d[2]);
    337    dst->d[3] = 1.0 / sqrt(src->d[3]);
    338 }
    339 
    340 static void
    341 micro_dmad(union tgsi_double_channel *dst,
    342            const union tgsi_double_channel *src)
    343 {
    344    dst->d[0] = src[0].d[0] * src[1].d[0] + src[2].d[0];
    345    dst->d[1] = src[0].d[1] * src[1].d[1] + src[2].d[1];
    346    dst->d[2] = src[0].d[2] * src[1].d[2] + src[2].d[2];
    347    dst->d[3] = src[0].d[3] * src[1].d[3] + src[2].d[3];
    348 }
    349 
    350 static void
    351 micro_dfrac(union tgsi_double_channel *dst,
    352             const union tgsi_double_channel *src)
    353 {
    354    dst->d[0] = src->d[0] - floor(src->d[0]);
    355    dst->d[1] = src->d[1] - floor(src->d[1]);
    356    dst->d[2] = src->d[2] - floor(src->d[2]);
    357    dst->d[3] = src->d[3] - floor(src->d[3]);
    358 }
    359 
    360 static void
    361 micro_dldexp(union tgsi_double_channel *dst,
    362              const union tgsi_double_channel *src0,
    363              union tgsi_exec_channel *src1)
    364 {
    365    dst->d[0] = ldexp(src0->d[0], src1->i[0]);
    366    dst->d[1] = ldexp(src0->d[1], src1->i[1]);
    367    dst->d[2] = ldexp(src0->d[2], src1->i[2]);
    368    dst->d[3] = ldexp(src0->d[3], src1->i[3]);
    369 }
    370 
    371 static void
    372 micro_dfracexp(union tgsi_double_channel *dst,
    373                union tgsi_exec_channel *dst_exp,
    374                const union tgsi_double_channel *src)
    375 {
    376    dst->d[0] = frexp(src->d[0], &dst_exp->i[0]);
    377    dst->d[1] = frexp(src->d[1], &dst_exp->i[1]);
    378    dst->d[2] = frexp(src->d[2], &dst_exp->i[2]);
    379    dst->d[3] = frexp(src->d[3], &dst_exp->i[3]);
    380 }
    381 
    382 static void
    383 micro_exp2(union tgsi_exec_channel *dst,
    384            const union tgsi_exec_channel *src)
    385 {
    386 #if FAST_MATH
    387    dst->f[0] = util_fast_exp2(src->f[0]);
    388    dst->f[1] = util_fast_exp2(src->f[1]);
    389    dst->f[2] = util_fast_exp2(src->f[2]);
    390    dst->f[3] = util_fast_exp2(src->f[3]);
    391 #else
    392 #if DEBUG
    393    /* Inf is okay for this instruction, so clamp it to silence assertions. */
    394    uint i;
    395    union tgsi_exec_channel clamped;
    396 
    397    for (i = 0; i < 4; i++) {
    398       if (src->f[i] > 127.99999f) {
    399          clamped.f[i] = 127.99999f;
    400       } else if (src->f[i] < -126.99999f) {
    401          clamped.f[i] = -126.99999f;
    402       } else {
    403          clamped.f[i] = src->f[i];
    404       }
    405    }
    406    src = &clamped;
    407 #endif /* DEBUG */
    408 
    409    dst->f[0] = powf(2.0f, src->f[0]);
    410    dst->f[1] = powf(2.0f, src->f[1]);
    411    dst->f[2] = powf(2.0f, src->f[2]);
    412    dst->f[3] = powf(2.0f, src->f[3]);
    413 #endif /* FAST_MATH */
    414 }
    415 
    416 static void
    417 micro_f2d(union tgsi_double_channel *dst,
    418           const union tgsi_exec_channel *src)
    419 {
    420    dst->d[0] = (double)src->f[0];
    421    dst->d[1] = (double)src->f[1];
    422    dst->d[2] = (double)src->f[2];
    423    dst->d[3] = (double)src->f[3];
    424 }
    425 
    426 static void
    427 micro_flr(union tgsi_exec_channel *dst,
    428           const union tgsi_exec_channel *src)
    429 {
    430    dst->f[0] = floorf(src->f[0]);
    431    dst->f[1] = floorf(src->f[1]);
    432    dst->f[2] = floorf(src->f[2]);
    433    dst->f[3] = floorf(src->f[3]);
    434 }
    435 
    436 static void
    437 micro_frc(union tgsi_exec_channel *dst,
    438           const union tgsi_exec_channel *src)
    439 {
    440    dst->f[0] = src->f[0] - floorf(src->f[0]);
    441    dst->f[1] = src->f[1] - floorf(src->f[1]);
    442    dst->f[2] = src->f[2] - floorf(src->f[2]);
    443    dst->f[3] = src->f[3] - floorf(src->f[3]);
    444 }
    445 
    446 static void
    447 micro_i2d(union tgsi_double_channel *dst,
    448           const union tgsi_exec_channel *src)
    449 {
    450    dst->d[0] = (double)src->i[0];
    451    dst->d[1] = (double)src->i[1];
    452    dst->d[2] = (double)src->i[2];
    453    dst->d[3] = (double)src->i[3];
    454 }
    455 
    456 static void
    457 micro_iabs(union tgsi_exec_channel *dst,
    458            const union tgsi_exec_channel *src)
    459 {
    460    dst->i[0] = src->i[0] >= 0 ? src->i[0] : -src->i[0];
    461    dst->i[1] = src->i[1] >= 0 ? src->i[1] : -src->i[1];
    462    dst->i[2] = src->i[2] >= 0 ? src->i[2] : -src->i[2];
    463    dst->i[3] = src->i[3] >= 0 ? src->i[3] : -src->i[3];
    464 }
    465 
    466 static void
    467 micro_ineg(union tgsi_exec_channel *dst,
    468            const union tgsi_exec_channel *src)
    469 {
    470    dst->i[0] = -src->i[0];
    471    dst->i[1] = -src->i[1];
    472    dst->i[2] = -src->i[2];
    473    dst->i[3] = -src->i[3];
    474 }
    475 
    476 static void
    477 micro_lg2(union tgsi_exec_channel *dst,
    478           const union tgsi_exec_channel *src)
    479 {
    480 #if FAST_MATH
    481    dst->f[0] = util_fast_log2(src->f[0]);
    482    dst->f[1] = util_fast_log2(src->f[1]);
    483    dst->f[2] = util_fast_log2(src->f[2]);
    484    dst->f[3] = util_fast_log2(src->f[3]);
    485 #else
    486    dst->f[0] = logf(src->f[0]) * 1.442695f;
    487    dst->f[1] = logf(src->f[1]) * 1.442695f;
    488    dst->f[2] = logf(src->f[2]) * 1.442695f;
    489    dst->f[3] = logf(src->f[3]) * 1.442695f;
    490 #endif
    491 }
    492 
    493 static void
    494 micro_lrp(union tgsi_exec_channel *dst,
    495           const union tgsi_exec_channel *src0,
    496           const union tgsi_exec_channel *src1,
    497           const union tgsi_exec_channel *src2)
    498 {
    499    dst->f[0] = src0->f[0] * (src1->f[0] - src2->f[0]) + src2->f[0];
    500    dst->f[1] = src0->f[1] * (src1->f[1] - src2->f[1]) + src2->f[1];
    501    dst->f[2] = src0->f[2] * (src1->f[2] - src2->f[2]) + src2->f[2];
    502    dst->f[3] = src0->f[3] * (src1->f[3] - src2->f[3]) + src2->f[3];
    503 }
    504 
    505 static void
    506 micro_mad(union tgsi_exec_channel *dst,
    507           const union tgsi_exec_channel *src0,
    508           const union tgsi_exec_channel *src1,
    509           const union tgsi_exec_channel *src2)
    510 {
    511    dst->f[0] = src0->f[0] * src1->f[0] + src2->f[0];
    512    dst->f[1] = src0->f[1] * src1->f[1] + src2->f[1];
    513    dst->f[2] = src0->f[2] * src1->f[2] + src2->f[2];
    514    dst->f[3] = src0->f[3] * src1->f[3] + src2->f[3];
    515 }
    516 
    517 static void
    518 micro_mov(union tgsi_exec_channel *dst,
    519           const union tgsi_exec_channel *src)
    520 {
    521    dst->u[0] = src->u[0];
    522    dst->u[1] = src->u[1];
    523    dst->u[2] = src->u[2];
    524    dst->u[3] = src->u[3];
    525 }
    526 
    527 static void
    528 micro_rcp(union tgsi_exec_channel *dst,
    529           const union tgsi_exec_channel *src)
    530 {
    531 #if 0 /* for debugging */
    532    assert(src->f[0] != 0.0f);
    533    assert(src->f[1] != 0.0f);
    534    assert(src->f[2] != 0.0f);
    535    assert(src->f[3] != 0.0f);
    536 #endif
    537    dst->f[0] = 1.0f / src->f[0];
    538    dst->f[1] = 1.0f / src->f[1];
    539    dst->f[2] = 1.0f / src->f[2];
    540    dst->f[3] = 1.0f / src->f[3];
    541 }
    542 
    543 static void
    544 micro_rnd(union tgsi_exec_channel *dst,
    545           const union tgsi_exec_channel *src)
    546 {
    547    dst->f[0] = _mesa_roundevenf(src->f[0]);
    548    dst->f[1] = _mesa_roundevenf(src->f[1]);
    549    dst->f[2] = _mesa_roundevenf(src->f[2]);
    550    dst->f[3] = _mesa_roundevenf(src->f[3]);
    551 }
    552 
    553 static void
    554 micro_rsq(union tgsi_exec_channel *dst,
    555           const union tgsi_exec_channel *src)
    556 {
    557 #if 0 /* for debugging */
    558    assert(src->f[0] != 0.0f);
    559    assert(src->f[1] != 0.0f);
    560    assert(src->f[2] != 0.0f);
    561    assert(src->f[3] != 0.0f);
    562 #endif
    563    dst->f[0] = 1.0f / sqrtf(src->f[0]);
    564    dst->f[1] = 1.0f / sqrtf(src->f[1]);
    565    dst->f[2] = 1.0f / sqrtf(src->f[2]);
    566    dst->f[3] = 1.0f / sqrtf(src->f[3]);
    567 }
    568 
    569 static void
    570 micro_sqrt(union tgsi_exec_channel *dst,
    571            const union tgsi_exec_channel *src)
    572 {
    573    dst->f[0] = sqrtf(src->f[0]);
    574    dst->f[1] = sqrtf(src->f[1]);
    575    dst->f[2] = sqrtf(src->f[2]);
    576    dst->f[3] = sqrtf(src->f[3]);
    577 }
    578 
    579 static void
    580 micro_seq(union tgsi_exec_channel *dst,
    581           const union tgsi_exec_channel *src0,
    582           const union tgsi_exec_channel *src1)
    583 {
    584    dst->f[0] = src0->f[0] == src1->f[0] ? 1.0f : 0.0f;
    585    dst->f[1] = src0->f[1] == src1->f[1] ? 1.0f : 0.0f;
    586    dst->f[2] = src0->f[2] == src1->f[2] ? 1.0f : 0.0f;
    587    dst->f[3] = src0->f[3] == src1->f[3] ? 1.0f : 0.0f;
    588 }
    589 
    590 static void
    591 micro_sge(union tgsi_exec_channel *dst,
    592           const union tgsi_exec_channel *src0,
    593           const union tgsi_exec_channel *src1)
    594 {
    595    dst->f[0] = src0->f[0] >= src1->f[0] ? 1.0f : 0.0f;
    596    dst->f[1] = src0->f[1] >= src1->f[1] ? 1.0f : 0.0f;
    597    dst->f[2] = src0->f[2] >= src1->f[2] ? 1.0f : 0.0f;
    598    dst->f[3] = src0->f[3] >= src1->f[3] ? 1.0f : 0.0f;
    599 }
    600 
    601 static void
    602 micro_sgn(union tgsi_exec_channel *dst,
    603           const union tgsi_exec_channel *src)
    604 {
    605    dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
    606    dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
    607    dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
    608    dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
    609 }
    610 
    611 static void
    612 micro_isgn(union tgsi_exec_channel *dst,
    613           const union tgsi_exec_channel *src)
    614 {
    615    dst->i[0] = src->i[0] < 0 ? -1 : src->i[0] > 0 ? 1 : 0;
    616    dst->i[1] = src->i[1] < 0 ? -1 : src->i[1] > 0 ? 1 : 0;
    617    dst->i[2] = src->i[2] < 0 ? -1 : src->i[2] > 0 ? 1 : 0;
    618    dst->i[3] = src->i[3] < 0 ? -1 : src->i[3] > 0 ? 1 : 0;
    619 }
    620 
    621 static void
    622 micro_sgt(union tgsi_exec_channel *dst,
    623           const union tgsi_exec_channel *src0,
    624           const union tgsi_exec_channel *src1)
    625 {
    626    dst->f[0] = src0->f[0] > src1->f[0] ? 1.0f : 0.0f;
    627    dst->f[1] = src0->f[1] > src1->f[1] ? 1.0f : 0.0f;
    628    dst->f[2] = src0->f[2] > src1->f[2] ? 1.0f : 0.0f;
    629    dst->f[3] = src0->f[3] > src1->f[3] ? 1.0f : 0.0f;
    630 }
    631 
    632 static void
    633 micro_sin(union tgsi_exec_channel *dst,
    634           const union tgsi_exec_channel *src)
    635 {
    636    dst->f[0] = sinf(src->f[0]);
    637    dst->f[1] = sinf(src->f[1]);
    638    dst->f[2] = sinf(src->f[2]);
    639    dst->f[3] = sinf(src->f[3]);
    640 }
    641 
    642 static void
    643 micro_sle(union tgsi_exec_channel *dst,
    644           const union tgsi_exec_channel *src0,
    645           const union tgsi_exec_channel *src1)
    646 {
    647    dst->f[0] = src0->f[0] <= src1->f[0] ? 1.0f : 0.0f;
    648    dst->f[1] = src0->f[1] <= src1->f[1] ? 1.0f : 0.0f;
    649    dst->f[2] = src0->f[2] <= src1->f[2] ? 1.0f : 0.0f;
    650    dst->f[3] = src0->f[3] <= src1->f[3] ? 1.0f : 0.0f;
    651 }
    652 
    653 static void
    654 micro_slt(union tgsi_exec_channel *dst,
    655           const union tgsi_exec_channel *src0,
    656           const union tgsi_exec_channel *src1)
    657 {
    658    dst->f[0] = src0->f[0] < src1->f[0] ? 1.0f : 0.0f;
    659    dst->f[1] = src0->f[1] < src1->f[1] ? 1.0f : 0.0f;
    660    dst->f[2] = src0->f[2] < src1->f[2] ? 1.0f : 0.0f;
    661    dst->f[3] = src0->f[3] < src1->f[3] ? 1.0f : 0.0f;
    662 }
    663 
    664 static void
    665 micro_sne(union tgsi_exec_channel *dst,
    666           const union tgsi_exec_channel *src0,
    667           const union tgsi_exec_channel *src1)
    668 {
    669    dst->f[0] = src0->f[0] != src1->f[0] ? 1.0f : 0.0f;
    670    dst->f[1] = src0->f[1] != src1->f[1] ? 1.0f : 0.0f;
    671    dst->f[2] = src0->f[2] != src1->f[2] ? 1.0f : 0.0f;
    672    dst->f[3] = src0->f[3] != src1->f[3] ? 1.0f : 0.0f;
    673 }
    674 
    675 static void
    676 micro_trunc(union tgsi_exec_channel *dst,
    677             const union tgsi_exec_channel *src)
    678 {
    679    dst->f[0] = truncf(src->f[0]);
    680    dst->f[1] = truncf(src->f[1]);
    681    dst->f[2] = truncf(src->f[2]);
    682    dst->f[3] = truncf(src->f[3]);
    683 }
    684 
    685 static void
    686 micro_u2d(union tgsi_double_channel *dst,
    687           const union tgsi_exec_channel *src)
    688 {
    689    dst->d[0] = (double)src->u[0];
    690    dst->d[1] = (double)src->u[1];
    691    dst->d[2] = (double)src->u[2];
    692    dst->d[3] = (double)src->u[3];
    693 }
    694 
    695 static void
    696 micro_i64abs(union tgsi_double_channel *dst,
    697              const union tgsi_double_channel *src)
    698 {
    699    dst->i64[0] = src->i64[0] >= 0.0 ? src->i64[0] : -src->i64[0];
    700    dst->i64[1] = src->i64[1] >= 0.0 ? src->i64[1] : -src->i64[1];
    701    dst->i64[2] = src->i64[2] >= 0.0 ? src->i64[2] : -src->i64[2];
    702    dst->i64[3] = src->i64[3] >= 0.0 ? src->i64[3] : -src->i64[3];
    703 }
    704 
    705 static void
    706 micro_i64sgn(union tgsi_double_channel *dst,
    707              const union tgsi_double_channel *src)
    708 {
    709    dst->i64[0] = src->i64[0] < 0 ? -1 : src->i64[0] > 0 ? 1 : 0;
    710    dst->i64[1] = src->i64[1] < 0 ? -1 : src->i64[1] > 0 ? 1 : 0;
    711    dst->i64[2] = src->i64[2] < 0 ? -1 : src->i64[2] > 0 ? 1 : 0;
    712    dst->i64[3] = src->i64[3] < 0 ? -1 : src->i64[3] > 0 ? 1 : 0;
    713 }
    714 
    715 static void
    716 micro_i64neg(union tgsi_double_channel *dst,
    717              const union tgsi_double_channel *src)
    718 {
    719    dst->i64[0] = -src->i64[0];
    720    dst->i64[1] = -src->i64[1];
    721    dst->i64[2] = -src->i64[2];
    722    dst->i64[3] = -src->i64[3];
    723 }
    724 
    725 static void
    726 micro_u64seq(union tgsi_double_channel *dst,
    727            const union tgsi_double_channel *src)
    728 {
    729    dst->u[0][0] = src[0].u64[0] == src[1].u64[0] ? ~0U : 0U;
    730    dst->u[1][0] = src[0].u64[1] == src[1].u64[1] ? ~0U : 0U;
    731    dst->u[2][0] = src[0].u64[2] == src[1].u64[2] ? ~0U : 0U;
    732    dst->u[3][0] = src[0].u64[3] == src[1].u64[3] ? ~0U : 0U;
    733 }
    734 
    735 static void
    736 micro_u64sne(union tgsi_double_channel *dst,
    737              const union tgsi_double_channel *src)
    738 {
    739    dst->u[0][0] = src[0].u64[0] != src[1].u64[0] ? ~0U : 0U;
    740    dst->u[1][0] = src[0].u64[1] != src[1].u64[1] ? ~0U : 0U;
    741    dst->u[2][0] = src[0].u64[2] != src[1].u64[2] ? ~0U : 0U;
    742    dst->u[3][0] = src[0].u64[3] != src[1].u64[3] ? ~0U : 0U;
    743 }
    744 
    745 static void
    746 micro_i64slt(union tgsi_double_channel *dst,
    747              const union tgsi_double_channel *src)
    748 {
    749    dst->u[0][0] = src[0].i64[0] < src[1].i64[0] ? ~0U : 0U;
    750    dst->u[1][0] = src[0].i64[1] < src[1].i64[1] ? ~0U : 0U;
    751    dst->u[2][0] = src[0].i64[2] < src[1].i64[2] ? ~0U : 0U;
    752    dst->u[3][0] = src[0].i64[3] < src[1].i64[3] ? ~0U : 0U;
    753 }
    754 
    755 static void
    756 micro_u64slt(union tgsi_double_channel *dst,
    757              const union tgsi_double_channel *src)
    758 {
    759    dst->u[0][0] = src[0].u64[0] < src[1].u64[0] ? ~0U : 0U;
    760    dst->u[1][0] = src[0].u64[1] < src[1].u64[1] ? ~0U : 0U;
    761    dst->u[2][0] = src[0].u64[2] < src[1].u64[2] ? ~0U : 0U;
    762    dst->u[3][0] = src[0].u64[3] < src[1].u64[3] ? ~0U : 0U;
    763 }
    764 
    765 static void
    766 micro_i64sge(union tgsi_double_channel *dst,
    767            const union tgsi_double_channel *src)
    768 {
    769    dst->u[0][0] = src[0].i64[0] >= src[1].i64[0] ? ~0U : 0U;
    770    dst->u[1][0] = src[0].i64[1] >= src[1].i64[1] ? ~0U : 0U;
    771    dst->u[2][0] = src[0].i64[2] >= src[1].i64[2] ? ~0U : 0U;
    772    dst->u[3][0] = src[0].i64[3] >= src[1].i64[3] ? ~0U : 0U;
    773 }
    774 
    775 static void
    776 micro_u64sge(union tgsi_double_channel *dst,
    777              const union tgsi_double_channel *src)
    778 {
    779    dst->u[0][0] = src[0].u64[0] >= src[1].u64[0] ? ~0U : 0U;
    780    dst->u[1][0] = src[0].u64[1] >= src[1].u64[1] ? ~0U : 0U;
    781    dst->u[2][0] = src[0].u64[2] >= src[1].u64[2] ? ~0U : 0U;
    782    dst->u[3][0] = src[0].u64[3] >= src[1].u64[3] ? ~0U : 0U;
    783 }
    784 
    785 static void
    786 micro_u64max(union tgsi_double_channel *dst,
    787              const union tgsi_double_channel *src)
    788 {
    789    dst->u64[0] = src[0].u64[0] > src[1].u64[0] ? src[0].u64[0] : src[1].u64[0];
    790    dst->u64[1] = src[0].u64[1] > src[1].u64[1] ? src[0].u64[1] : src[1].u64[1];
    791    dst->u64[2] = src[0].u64[2] > src[1].u64[2] ? src[0].u64[2] : src[1].u64[2];
    792    dst->u64[3] = src[0].u64[3] > src[1].u64[3] ? src[0].u64[3] : src[1].u64[3];
    793 }
    794 
    795 static void
    796 micro_i64max(union tgsi_double_channel *dst,
    797              const union tgsi_double_channel *src)
    798 {
    799    dst->i64[0] = src[0].i64[0] > src[1].i64[0] ? src[0].i64[0] : src[1].i64[0];
    800    dst->i64[1] = src[0].i64[1] > src[1].i64[1] ? src[0].i64[1] : src[1].i64[1];
    801    dst->i64[2] = src[0].i64[2] > src[1].i64[2] ? src[0].i64[2] : src[1].i64[2];
    802    dst->i64[3] = src[0].i64[3] > src[1].i64[3] ? src[0].i64[3] : src[1].i64[3];
    803 }
    804 
    805 static void
    806 micro_u64min(union tgsi_double_channel *dst,
    807              const union tgsi_double_channel *src)
    808 {
    809    dst->u64[0] = src[0].u64[0] < src[1].u64[0] ? src[0].u64[0] : src[1].u64[0];
    810    dst->u64[1] = src[0].u64[1] < src[1].u64[1] ? src[0].u64[1] : src[1].u64[1];
    811    dst->u64[2] = src[0].u64[2] < src[1].u64[2] ? src[0].u64[2] : src[1].u64[2];
    812    dst->u64[3] = src[0].u64[3] < src[1].u64[3] ? src[0].u64[3] : src[1].u64[3];
    813 }
    814 
    815 static void
    816 micro_i64min(union tgsi_double_channel *dst,
    817              const union tgsi_double_channel *src)
    818 {
    819    dst->i64[0] = src[0].i64[0] < src[1].i64[0] ? src[0].i64[0] : src[1].i64[0];
    820    dst->i64[1] = src[0].i64[1] < src[1].i64[1] ? src[0].i64[1] : src[1].i64[1];
    821    dst->i64[2] = src[0].i64[2] < src[1].i64[2] ? src[0].i64[2] : src[1].i64[2];
    822    dst->i64[3] = src[0].i64[3] < src[1].i64[3] ? src[0].i64[3] : src[1].i64[3];
    823 }
    824 
    825 static void
    826 micro_u64add(union tgsi_double_channel *dst,
    827              const union tgsi_double_channel *src)
    828 {
    829    dst->u64[0] = src[0].u64[0] + src[1].u64[0];
    830    dst->u64[1] = src[0].u64[1] + src[1].u64[1];
    831    dst->u64[2] = src[0].u64[2] + src[1].u64[2];
    832    dst->u64[3] = src[0].u64[3] + src[1].u64[3];
    833 }
    834 
    835 static void
    836 micro_u64mul(union tgsi_double_channel *dst,
    837              const union tgsi_double_channel *src)
    838 {
    839    dst->u64[0] = src[0].u64[0] * src[1].u64[0];
    840    dst->u64[1] = src[0].u64[1] * src[1].u64[1];
    841    dst->u64[2] = src[0].u64[2] * src[1].u64[2];
    842    dst->u64[3] = src[0].u64[3] * src[1].u64[3];
    843 }
    844 
    845 static void
    846 micro_u64div(union tgsi_double_channel *dst,
    847              const union tgsi_double_channel *src)
    848 {
    849    dst->u64[0] = src[1].u64[0] ? src[0].u64[0] / src[1].u64[0] : ~0ull;
    850    dst->u64[1] = src[1].u64[1] ? src[0].u64[1] / src[1].u64[1] : ~0ull;
    851    dst->u64[2] = src[1].u64[2] ? src[0].u64[2] / src[1].u64[2] : ~0ull;
    852    dst->u64[3] = src[1].u64[3] ? src[0].u64[3] / src[1].u64[3] : ~0ull;
    853 }
    854 
    855 static void
    856 micro_i64div(union tgsi_double_channel *dst,
    857              const union tgsi_double_channel *src)
    858 {
    859    dst->i64[0] = src[1].i64[0] ? src[0].i64[0] / src[1].i64[0] : 0;
    860    dst->i64[1] = src[1].i64[1] ? src[0].i64[1] / src[1].i64[1] : 0;
    861    dst->i64[2] = src[1].i64[2] ? src[0].i64[2] / src[1].i64[2] : 0;
    862    dst->i64[3] = src[1].i64[3] ? src[0].i64[3] / src[1].i64[3] : 0;
    863 }
    864 
    865 static void
    866 micro_u64mod(union tgsi_double_channel *dst,
    867              const union tgsi_double_channel *src)
    868 {
    869    dst->u64[0] = src[1].u64[0] ? src[0].u64[0] % src[1].u64[0] : ~0ull;
    870    dst->u64[1] = src[1].u64[1] ? src[0].u64[1] % src[1].u64[1] : ~0ull;
    871    dst->u64[2] = src[1].u64[2] ? src[0].u64[2] % src[1].u64[2] : ~0ull;
    872    dst->u64[3] = src[1].u64[3] ? src[0].u64[3] % src[1].u64[3] : ~0ull;
    873 }
    874 
    875 static void
    876 micro_i64mod(union tgsi_double_channel *dst,
    877              const union tgsi_double_channel *src)
    878 {
    879    dst->i64[0] = src[1].i64[0] ? src[0].i64[0] % src[1].i64[0] : ~0ll;
    880    dst->i64[1] = src[1].i64[1] ? src[0].i64[1] % src[1].i64[1] : ~0ll;
    881    dst->i64[2] = src[1].i64[2] ? src[0].i64[2] % src[1].i64[2] : ~0ll;
    882    dst->i64[3] = src[1].i64[3] ? src[0].i64[3] % src[1].i64[3] : ~0ll;
    883 }
    884 
    885 static void
    886 micro_u64shl(union tgsi_double_channel *dst,
    887              const union tgsi_double_channel *src0,
    888              union tgsi_exec_channel *src1)
    889 {
    890    unsigned masked_count;
    891    masked_count = src1->u[0] & 0x3f;
    892    dst->u64[0] = src0->u64[0] << masked_count;
    893    masked_count = src1->u[1] & 0x3f;
    894    dst->u64[1] = src0->u64[1] << masked_count;
    895    masked_count = src1->u[2] & 0x3f;
    896    dst->u64[2] = src0->u64[2] << masked_count;
    897    masked_count = src1->u[3] & 0x3f;
    898    dst->u64[3] = src0->u64[3] << masked_count;
    899 }
    900 
    901 static void
    902 micro_i64shr(union tgsi_double_channel *dst,
    903              const union tgsi_double_channel *src0,
    904              union tgsi_exec_channel *src1)
    905 {
    906    unsigned masked_count;
    907    masked_count = src1->u[0] & 0x3f;
    908    dst->i64[0] = src0->i64[0] >> masked_count;
    909    masked_count = src1->u[1] & 0x3f;
    910    dst->i64[1] = src0->i64[1] >> masked_count;
    911    masked_count = src1->u[2] & 0x3f;
    912    dst->i64[2] = src0->i64[2] >> masked_count;
    913    masked_count = src1->u[3] & 0x3f;
    914    dst->i64[3] = src0->i64[3] >> masked_count;
    915 }
    916 
    917 static void
    918 micro_u64shr(union tgsi_double_channel *dst,
    919              const union tgsi_double_channel *src0,
    920              union tgsi_exec_channel *src1)
    921 {
    922    unsigned masked_count;
    923    masked_count = src1->u[0] & 0x3f;
    924    dst->u64[0] = src0->u64[0] >> masked_count;
    925    masked_count = src1->u[1] & 0x3f;
    926    dst->u64[1] = src0->u64[1] >> masked_count;
    927    masked_count = src1->u[2] & 0x3f;
    928    dst->u64[2] = src0->u64[2] >> masked_count;
    929    masked_count = src1->u[3] & 0x3f;
    930    dst->u64[3] = src0->u64[3] >> masked_count;
    931 }
    932 
/**
 * Tags naming the data type an operand is treated as.
 * fetch_source_d() takes one of these as its src_datatype argument.
 */
enum tgsi_exec_datatype {
   TGSI_EXEC_DATA_FLOAT,
   TGSI_EXEC_DATA_INT,
   TGSI_EXEC_DATA_UINT,
   TGSI_EXEC_DATA_DOUBLE,
   TGSI_EXEC_DATA_INT64,
   TGSI_EXEC_DATA_UINT64,
};
    941 
/*
 * Shorthand locations of various utility registers (_I = Index, _C = Channel).
 * Each macro is a plain alias for the TGSI_EXEC_TEMP_* name it expands to.
 */
#define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
#define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
#define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
#define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
#define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
#define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
    951 
    952 
    953 /** The execution mask depends on the conditional mask and the loop mask */
    954 #define UPDATE_EXEC_MASK(MACH) \
    955       MACH->ExecMask = MACH->CondMask & MACH->LoopMask & MACH->ContMask & MACH->Switch.mask & MACH->FuncMask
    956 
    957 
/*
 * Constant channels with the same value in all four lanes.
 * The brace initializers set the first member of the union.
 */

/* All lanes 0.0 */
static const union tgsi_exec_channel ZeroVec =
   { { 0.0, 0.0, 0.0, 0.0 } };

/* All lanes 1.0 */
static const union tgsi_exec_channel OneVec = {
   {1.0f, 1.0f, 1.0f, 1.0f}
};

/* All lanes +128.0 */
static const union tgsi_exec_channel P128Vec = {
   {128.0f, 128.0f, 128.0f, 128.0f}
};

/* All lanes -128.0 */
static const union tgsi_exec_channel M128Vec = {
   {-128.0f, -128.0f, -128.0f, -128.0f}
};
    972 
    973 
    974 /**
    975  * Assert that none of the float values in 'chan' are infinite or NaN.
    976  * NaN and Inf may occur normally during program execution and should
    977  * not lead to crashes, etc.  But when debugging, it's helpful to catch
    978  * them.
    979  */
    980 static inline void
    981 check_inf_or_nan(const union tgsi_exec_channel *chan)
    982 {
    983    assert(!util_is_inf_or_nan((chan)->f[0]));
    984    assert(!util_is_inf_or_nan((chan)->f[1]));
    985    assert(!util_is_inf_or_nan((chan)->f[2]));
    986    assert(!util_is_inf_or_nan((chan)->f[3]));
    987 }
    988 
    989 
    990 #ifdef DEBUG
    991 static void
    992 print_chan(const char *msg, const union tgsi_exec_channel *chan)
    993 {
    994    debug_printf("%s = {%f, %f, %f, %f}\n",
    995                 msg, chan->f[0], chan->f[1], chan->f[2], chan->f[3]);
    996 }
    997 #endif
    998 
    999 
   1000 #ifdef DEBUG
   1001 static void
   1002 print_temp(const struct tgsi_exec_machine *mach, uint index)
   1003 {
   1004    const struct tgsi_exec_vector *tmp = &mach->Temps[index];
   1005    int i;
   1006    debug_printf("Temp[%u] =\n", index);
   1007    for (i = 0; i < 4; i++) {
   1008       debug_printf("  %c: { %f, %f, %f, %f }\n",
   1009                    "XYZW"[i],
   1010                    tmp->xyzw[i].f[0],
   1011                    tmp->xyzw[i].f[1],
   1012                    tmp->xyzw[i].f[2],
   1013                    tmp->xyzw[i].f[3]);
   1014    }
   1015 }
   1016 #endif
   1017 
   1018 
   1019 void
   1020 tgsi_exec_set_constant_buffers(struct tgsi_exec_machine *mach,
   1021                                unsigned num_bufs,
   1022                                const void **bufs,
   1023                                const unsigned *buf_sizes)
   1024 {
   1025    unsigned i;
   1026 
   1027    for (i = 0; i < num_bufs; i++) {
   1028       mach->Consts[i] = bufs[i];
   1029       mach->ConstsSize[i] = buf_sizes[i];
   1030    }
   1031 }
   1032 
   1033 
   1034 /**
   1035  * Check if there's a potential src/dst register data dependency when
   1036  * using SOA execution.
   1037  * Example:
   1038  *   MOV T, T.yxwz;
   1039  * This would expand into:
   1040  *   MOV t0, t1;
   1041  *   MOV t1, t0;
   1042  *   MOV t2, t3;
   1043  *   MOV t3, t2;
   1044  * The second instruction will have the wrong value for t0 if executed as-is.
   1045  */
   1046 boolean
   1047 tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst)
   1048 {
   1049    uint i, chan;
   1050 
   1051    uint writemask = inst->Dst[0].Register.WriteMask;
   1052    if (writemask == TGSI_WRITEMASK_X ||
   1053        writemask == TGSI_WRITEMASK_Y ||
   1054        writemask == TGSI_WRITEMASK_Z ||
   1055        writemask == TGSI_WRITEMASK_W ||
   1056        writemask == TGSI_WRITEMASK_NONE) {
   1057       /* no chance of data dependency */
   1058       return FALSE;
   1059    }
   1060 
   1061    /* loop over src regs */
   1062    for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
   1063       if ((inst->Src[i].Register.File ==
   1064            inst->Dst[0].Register.File) &&
   1065           ((inst->Src[i].Register.Index ==
   1066             inst->Dst[0].Register.Index) ||
   1067            inst->Src[i].Register.Indirect ||
   1068            inst->Dst[0].Register.Indirect)) {
   1069          /* loop over dest channels */
   1070          uint channelsWritten = 0x0;
   1071          for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   1072             if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   1073                /* check if we're reading a channel that's been written */
   1074                uint swizzle = tgsi_util_get_full_src_register_swizzle(&inst->Src[i], chan);
   1075                if (channelsWritten & (1 << swizzle)) {
   1076                   return TRUE;
   1077                }
   1078 
   1079                channelsWritten |= (1 << chan);
   1080             }
   1081          }
   1082       }
   1083    }
   1084    return FALSE;
   1085 }
   1086 
   1087 
   1088 /**
   1089  * Initialize machine state by expanding tokens to full instructions,
   1090  * allocating temporary storage, setting up constants, etc.
   1091  * After this, we can call tgsi_exec_machine_run() many times.
   1092  */
   1093 void
   1094 tgsi_exec_machine_bind_shader(
   1095    struct tgsi_exec_machine *mach,
   1096    const struct tgsi_token *tokens,
   1097    struct tgsi_sampler *sampler,
   1098    struct tgsi_image *image,
   1099    struct tgsi_buffer *buffer)
   1100 {
   1101    uint k;
   1102    struct tgsi_parse_context parse;
   1103    struct tgsi_full_instruction *instructions;
   1104    struct tgsi_full_declaration *declarations;
   1105    uint maxInstructions = 10, numInstructions = 0;
   1106    uint maxDeclarations = 10, numDeclarations = 0;
   1107 
   1108 #if 0
   1109    tgsi_dump(tokens, 0);
   1110 #endif
   1111 
   1112    util_init_math();
   1113 
   1114 
   1115    mach->Tokens = tokens;
   1116    mach->Sampler = sampler;
   1117    mach->Image = image;
   1118    mach->Buffer = buffer;
   1119 
   1120    if (!tokens) {
   1121       /* unbind and free all */
   1122       FREE(mach->Declarations);
   1123       mach->Declarations = NULL;
   1124       mach->NumDeclarations = 0;
   1125 
   1126       FREE(mach->Instructions);
   1127       mach->Instructions = NULL;
   1128       mach->NumInstructions = 0;
   1129 
   1130       return;
   1131    }
   1132 
   1133    k = tgsi_parse_init (&parse, mach->Tokens);
   1134    if (k != TGSI_PARSE_OK) {
   1135       debug_printf( "Problem parsing!\n" );
   1136       return;
   1137    }
   1138 
   1139    mach->ImmLimit = 0;
   1140    mach->NumOutputs = 0;
   1141 
   1142    for (k = 0; k < TGSI_SEMANTIC_COUNT; k++)
   1143       mach->SysSemanticToIndex[k] = -1;
   1144 
   1145    if (mach->ShaderType == PIPE_SHADER_GEOMETRY &&
   1146        !mach->UsedGeometryShader) {
   1147       struct tgsi_exec_vector *inputs;
   1148       struct tgsi_exec_vector *outputs;
   1149 
   1150       inputs = align_malloc(sizeof(struct tgsi_exec_vector) *
   1151                             TGSI_MAX_PRIM_VERTICES * PIPE_MAX_SHADER_INPUTS,
   1152                             16);
   1153 
   1154       if (!inputs)
   1155          return;
   1156 
   1157       outputs = align_malloc(sizeof(struct tgsi_exec_vector) *
   1158                              TGSI_MAX_TOTAL_VERTICES, 16);
   1159 
   1160       if (!outputs) {
   1161          align_free(inputs);
   1162          return;
   1163       }
   1164 
   1165       align_free(mach->Inputs);
   1166       align_free(mach->Outputs);
   1167 
   1168       mach->Inputs = inputs;
   1169       mach->Outputs = outputs;
   1170       mach->UsedGeometryShader = TRUE;
   1171    }
   1172 
   1173    declarations = (struct tgsi_full_declaration *)
   1174       MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
   1175 
   1176    if (!declarations) {
   1177       return;
   1178    }
   1179 
   1180    instructions = (struct tgsi_full_instruction *)
   1181       MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
   1182 
   1183    if (!instructions) {
   1184       FREE( declarations );
   1185       return;
   1186    }
   1187 
   1188    while( !tgsi_parse_end_of_tokens( &parse ) ) {
   1189       uint i;
   1190 
   1191       tgsi_parse_token( &parse );
   1192       switch( parse.FullToken.Token.Type ) {
   1193       case TGSI_TOKEN_TYPE_DECLARATION:
   1194          /* save expanded declaration */
   1195          if (numDeclarations == maxDeclarations) {
   1196             declarations = REALLOC(declarations,
   1197                                    maxDeclarations
   1198                                    * sizeof(struct tgsi_full_declaration),
   1199                                    (maxDeclarations + 10)
   1200                                    * sizeof(struct tgsi_full_declaration));
   1201             maxDeclarations += 10;
   1202          }
   1203          if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_OUTPUT) {
   1204             unsigned reg;
   1205             for (reg = parse.FullToken.FullDeclaration.Range.First;
   1206                  reg <= parse.FullToken.FullDeclaration.Range.Last;
   1207                  ++reg) {
   1208                ++mach->NumOutputs;
   1209             }
   1210          }
   1211          else if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
   1212             const struct tgsi_full_declaration *decl = &parse.FullToken.FullDeclaration;
   1213             mach->SysSemanticToIndex[decl->Semantic.Name] = decl->Range.First;
   1214          }
   1215 
   1216          memcpy(declarations + numDeclarations,
   1217                 &parse.FullToken.FullDeclaration,
   1218                 sizeof(declarations[0]));
   1219          numDeclarations++;
   1220          break;
   1221 
   1222       case TGSI_TOKEN_TYPE_IMMEDIATE:
   1223          {
   1224             uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
   1225             assert( size <= 4 );
   1226             assert( mach->ImmLimit + 1 <= TGSI_EXEC_NUM_IMMEDIATES );
   1227 
   1228             for( i = 0; i < size; i++ ) {
   1229                mach->Imms[mach->ImmLimit][i] =
   1230 		  parse.FullToken.FullImmediate.u[i].Float;
   1231             }
   1232             mach->ImmLimit += 1;
   1233          }
   1234          break;
   1235 
   1236       case TGSI_TOKEN_TYPE_INSTRUCTION:
   1237 
   1238          /* save expanded instruction */
   1239          if (numInstructions == maxInstructions) {
   1240             instructions = REALLOC(instructions,
   1241                                    maxInstructions
   1242                                    * sizeof(struct tgsi_full_instruction),
   1243                                    (maxInstructions + 10)
   1244                                    * sizeof(struct tgsi_full_instruction));
   1245             maxInstructions += 10;
   1246          }
   1247 
   1248          memcpy(instructions + numInstructions,
   1249                 &parse.FullToken.FullInstruction,
   1250                 sizeof(instructions[0]));
   1251 
   1252          numInstructions++;
   1253          break;
   1254 
   1255       case TGSI_TOKEN_TYPE_PROPERTY:
   1256          if (mach->ShaderType == PIPE_SHADER_GEOMETRY) {
   1257             if (parse.FullToken.FullProperty.Property.PropertyName == TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES) {
   1258                mach->MaxOutputVertices = parse.FullToken.FullProperty.u[0].Data;
   1259             }
   1260          }
   1261          break;
   1262 
   1263       default:
   1264          assert( 0 );
   1265       }
   1266    }
   1267    tgsi_parse_free (&parse);
   1268 
   1269    FREE(mach->Declarations);
   1270    mach->Declarations = declarations;
   1271    mach->NumDeclarations = numDeclarations;
   1272 
   1273    FREE(mach->Instructions);
   1274    mach->Instructions = instructions;
   1275    mach->NumInstructions = numInstructions;
   1276 }
   1277 
   1278 
   1279 struct tgsi_exec_machine *
   1280 tgsi_exec_machine_create(enum pipe_shader_type shader_type)
   1281 {
   1282    struct tgsi_exec_machine *mach;
   1283    uint i;
   1284 
   1285    mach = align_malloc( sizeof *mach, 16 );
   1286    if (!mach)
   1287       goto fail;
   1288 
   1289    memset(mach, 0, sizeof(*mach));
   1290 
   1291    mach->ShaderType = shader_type;
   1292    mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
   1293    mach->MaxGeometryShaderOutputs = TGSI_MAX_TOTAL_VERTICES;
   1294 
   1295    if (shader_type != PIPE_SHADER_COMPUTE) {
   1296       mach->Inputs = align_malloc(sizeof(struct tgsi_exec_vector) * PIPE_MAX_SHADER_INPUTS, 16);
   1297       mach->Outputs = align_malloc(sizeof(struct tgsi_exec_vector) * PIPE_MAX_SHADER_OUTPUTS, 16);
   1298       if (!mach->Inputs || !mach->Outputs)
   1299          goto fail;
   1300    }
   1301 
   1302    /* Setup constants needed by the SSE2 executor. */
   1303    for( i = 0; i < 4; i++ ) {
   1304       mach->Temps[TGSI_EXEC_TEMP_00000000_I].xyzw[TGSI_EXEC_TEMP_00000000_C].u[i] = 0x00000000;
   1305       mach->Temps[TGSI_EXEC_TEMP_7FFFFFFF_I].xyzw[TGSI_EXEC_TEMP_7FFFFFFF_C].u[i] = 0x7FFFFFFF;
   1306       mach->Temps[TGSI_EXEC_TEMP_80000000_I].xyzw[TGSI_EXEC_TEMP_80000000_C].u[i] = 0x80000000;
   1307       mach->Temps[TGSI_EXEC_TEMP_FFFFFFFF_I].xyzw[TGSI_EXEC_TEMP_FFFFFFFF_C].u[i] = 0xFFFFFFFF;    /* not used */
   1308       mach->Temps[TGSI_EXEC_TEMP_ONE_I].xyzw[TGSI_EXEC_TEMP_ONE_C].f[i] = 1.0f;
   1309       mach->Temps[TGSI_EXEC_TEMP_TWO_I].xyzw[TGSI_EXEC_TEMP_TWO_C].f[i] = 2.0f;    /* not used */
   1310       mach->Temps[TGSI_EXEC_TEMP_128_I].xyzw[TGSI_EXEC_TEMP_128_C].f[i] = 128.0f;
   1311       mach->Temps[TGSI_EXEC_TEMP_MINUS_128_I].xyzw[TGSI_EXEC_TEMP_MINUS_128_C].f[i] = -128.0f;
   1312       mach->Temps[TGSI_EXEC_TEMP_THREE_I].xyzw[TGSI_EXEC_TEMP_THREE_C].f[i] = 3.0f;
   1313       mach->Temps[TGSI_EXEC_TEMP_HALF_I].xyzw[TGSI_EXEC_TEMP_HALF_C].f[i] = 0.5f;
   1314    }
   1315 
   1316 #ifdef DEBUG
   1317    /* silence warnings */
   1318    (void) print_chan;
   1319    (void) print_temp;
   1320 #endif
   1321 
   1322    return mach;
   1323 
   1324 fail:
   1325    if (mach) {
   1326       align_free(mach->Inputs);
   1327       align_free(mach->Outputs);
   1328       align_free(mach);
   1329    }
   1330    return NULL;
   1331 }
   1332 
   1333 
   1334 void
   1335 tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach)
   1336 {
   1337    if (mach) {
   1338       FREE(mach->Instructions);
   1339       FREE(mach->Declarations);
   1340 
   1341       align_free(mach->Inputs);
   1342       align_free(mach->Outputs);
   1343 
   1344       align_free(mach);
   1345    }
   1346 }
   1347 
   1348 static void
   1349 micro_add(union tgsi_exec_channel *dst,
   1350           const union tgsi_exec_channel *src0,
   1351           const union tgsi_exec_channel *src1)
   1352 {
   1353    dst->f[0] = src0->f[0] + src1->f[0];
   1354    dst->f[1] = src0->f[1] + src1->f[1];
   1355    dst->f[2] = src0->f[2] + src1->f[2];
   1356    dst->f[3] = src0->f[3] + src1->f[3];
   1357 }
   1358 
   1359 static void
   1360 micro_div(
   1361    union tgsi_exec_channel *dst,
   1362    const union tgsi_exec_channel *src0,
   1363    const union tgsi_exec_channel *src1 )
   1364 {
   1365    if (src1->f[0] != 0) {
   1366       dst->f[0] = src0->f[0] / src1->f[0];
   1367    }
   1368    if (src1->f[1] != 0) {
   1369       dst->f[1] = src0->f[1] / src1->f[1];
   1370    }
   1371    if (src1->f[2] != 0) {
   1372       dst->f[2] = src0->f[2] / src1->f[2];
   1373    }
   1374    if (src1->f[3] != 0) {
   1375       dst->f[3] = src0->f[3] / src1->f[3];
   1376    }
   1377 }
   1378 
   1379 static void
   1380 micro_lt(
   1381    union tgsi_exec_channel *dst,
   1382    const union tgsi_exec_channel *src0,
   1383    const union tgsi_exec_channel *src1,
   1384    const union tgsi_exec_channel *src2,
   1385    const union tgsi_exec_channel *src3 )
   1386 {
   1387    dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
   1388    dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
   1389    dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
   1390    dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
   1391 }
   1392 
   1393 static void
   1394 micro_max(union tgsi_exec_channel *dst,
   1395           const union tgsi_exec_channel *src0,
   1396           const union tgsi_exec_channel *src1)
   1397 {
   1398    dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
   1399    dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
   1400    dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
   1401    dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
   1402 }
   1403 
   1404 static void
   1405 micro_min(union tgsi_exec_channel *dst,
   1406           const union tgsi_exec_channel *src0,
   1407           const union tgsi_exec_channel *src1)
   1408 {
   1409    dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
   1410    dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
   1411    dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
   1412    dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
   1413 }
   1414 
   1415 static void
   1416 micro_mul(union tgsi_exec_channel *dst,
   1417           const union tgsi_exec_channel *src0,
   1418           const union tgsi_exec_channel *src1)
   1419 {
   1420    dst->f[0] = src0->f[0] * src1->f[0];
   1421    dst->f[1] = src0->f[1] * src1->f[1];
   1422    dst->f[2] = src0->f[2] * src1->f[2];
   1423    dst->f[3] = src0->f[3] * src1->f[3];
   1424 }
   1425 
   1426 static void
   1427 micro_neg(
   1428    union tgsi_exec_channel *dst,
   1429    const union tgsi_exec_channel *src )
   1430 {
   1431    dst->f[0] = -src->f[0];
   1432    dst->f[1] = -src->f[1];
   1433    dst->f[2] = -src->f[2];
   1434    dst->f[3] = -src->f[3];
   1435 }
   1436 
   1437 static void
   1438 micro_pow(
   1439    union tgsi_exec_channel *dst,
   1440    const union tgsi_exec_channel *src0,
   1441    const union tgsi_exec_channel *src1 )
   1442 {
   1443 #if FAST_MATH
   1444    dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] );
   1445    dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] );
   1446    dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] );
   1447    dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] );
   1448 #else
   1449    dst->f[0] = powf( src0->f[0], src1->f[0] );
   1450    dst->f[1] = powf( src0->f[1], src1->f[1] );
   1451    dst->f[2] = powf( src0->f[2], src1->f[2] );
   1452    dst->f[3] = powf( src0->f[3], src1->f[3] );
   1453 #endif
   1454 }
   1455 
   1456 static void
   1457 micro_ldexp(union tgsi_exec_channel *dst,
   1458             const union tgsi_exec_channel *src0,
   1459             const union tgsi_exec_channel *src1)
   1460 {
   1461    dst->f[0] = ldexpf(src0->f[0], src1->i[0]);
   1462    dst->f[1] = ldexpf(src0->f[1], src1->i[1]);
   1463    dst->f[2] = ldexpf(src0->f[2], src1->i[2]);
   1464    dst->f[3] = ldexpf(src0->f[3], src1->i[3]);
   1465 }
   1466 
   1467 static void
   1468 micro_sub(union tgsi_exec_channel *dst,
   1469           const union tgsi_exec_channel *src0,
   1470           const union tgsi_exec_channel *src1)
   1471 {
   1472    dst->f[0] = src0->f[0] - src1->f[0];
   1473    dst->f[1] = src0->f[1] - src1->f[1];
   1474    dst->f[2] = src0->f[2] - src1->f[2];
   1475    dst->f[3] = src0->f[3] - src1->f[3];
   1476 }
   1477 
   1478 static void
   1479 fetch_src_file_channel(const struct tgsi_exec_machine *mach,
   1480                        const uint chan_index,
   1481                        const uint file,
   1482                        const uint swizzle,
   1483                        const union tgsi_exec_channel *index,
   1484                        const union tgsi_exec_channel *index2D,
   1485                        union tgsi_exec_channel *chan)
   1486 {
   1487    uint i;
   1488 
   1489    assert(swizzle < 4);
   1490 
   1491    switch (file) {
   1492    case TGSI_FILE_CONSTANT:
   1493       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   1494          assert(index2D->i[i] >= 0 && index2D->i[i] < PIPE_MAX_CONSTANT_BUFFERS);
   1495          assert(mach->Consts[index2D->i[i]]);
   1496 
   1497          if (index->i[i] < 0) {
   1498             chan->u[i] = 0;
   1499          } else {
   1500             /* NOTE: copying the const value as a uint instead of float */
   1501             const uint constbuf = index2D->i[i];
   1502             const uint *buf = (const uint *)mach->Consts[constbuf];
   1503             const int pos = index->i[i] * 4 + swizzle;
   1504             /* const buffer bounds check */
   1505             if (pos < 0 || pos >= (int) mach->ConstsSize[constbuf]) {
   1506                if (0) {
   1507                   /* Debug: print warning */
   1508                   static int count = 0;
   1509                   if (count++ < 100)
   1510                      debug_printf("TGSI Exec: const buffer index %d"
   1511                                   " out of bounds\n", pos);
   1512                }
   1513                chan->u[i] = 0;
   1514             }
   1515             else
   1516                chan->u[i] = buf[pos];
   1517          }
   1518       }
   1519       break;
   1520 
   1521    case TGSI_FILE_INPUT:
   1522       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   1523          /*
   1524          if (PIPE_SHADER_GEOMETRY == mach->ShaderType) {
   1525             debug_printf("Fetching Input[%d] (2d=%d, 1d=%d)\n",
   1526                          index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i],
   1527                          index2D->i[i], index->i[i]);
   1528                          }*/
   1529          int pos = index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i];
   1530          assert(pos >= 0);
   1531          assert(pos < TGSI_MAX_PRIM_VERTICES * PIPE_MAX_ATTRIBS);
   1532          chan->u[i] = mach->Inputs[pos].xyzw[swizzle].u[i];
   1533       }
   1534       break;
   1535 
   1536    case TGSI_FILE_SYSTEM_VALUE:
   1537       /* XXX no swizzling at this point.  Will be needed if we put
   1538        * gl_FragCoord, for example, in a sys value register.
   1539        */
   1540       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   1541          chan->u[i] = mach->SystemValue[index->i[i]].xyzw[swizzle].u[i];
   1542       }
   1543       break;
   1544 
   1545    case TGSI_FILE_TEMPORARY:
   1546       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   1547          assert(index->i[i] < TGSI_EXEC_NUM_TEMPS);
   1548          assert(index2D->i[i] == 0);
   1549 
   1550          chan->u[i] = mach->Temps[index->i[i]].xyzw[swizzle].u[i];
   1551       }
   1552       break;
   1553 
   1554    case TGSI_FILE_IMMEDIATE:
   1555       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   1556          assert(index->i[i] >= 0 && index->i[i] < (int)mach->ImmLimit);
   1557          assert(index2D->i[i] == 0);
   1558 
   1559          chan->f[i] = mach->Imms[index->i[i]][swizzle];
   1560       }
   1561       break;
   1562 
   1563    case TGSI_FILE_ADDRESS:
   1564       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   1565          assert(index->i[i] >= 0);
   1566          assert(index2D->i[i] == 0);
   1567 
   1568          chan->u[i] = mach->Addrs[index->i[i]].xyzw[swizzle].u[i];
   1569       }
   1570       break;
   1571 
   1572    case TGSI_FILE_OUTPUT:
   1573       /* vertex/fragment output vars can be read too */
   1574       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   1575          assert(index->i[i] >= 0);
   1576          assert(index2D->i[i] == 0);
   1577 
   1578          chan->u[i] = mach->Outputs[index->i[i]].xyzw[swizzle].u[i];
   1579       }
   1580       break;
   1581 
   1582    default:
   1583       assert(0);
   1584       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   1585          chan->u[i] = 0;
   1586       }
   1587    }
   1588 }
   1589 
   1590 static void
   1591 fetch_source_d(const struct tgsi_exec_machine *mach,
   1592                union tgsi_exec_channel *chan,
   1593                const struct tgsi_full_src_register *reg,
   1594                const uint chan_index,
   1595                enum tgsi_exec_datatype src_datatype)
   1596 {
   1597    union tgsi_exec_channel index;
   1598    union tgsi_exec_channel index2D;
   1599    uint swizzle;
   1600 
   1601    /* We start with a direct index into a register file.
   1602     *
   1603     *    file[1],
   1604     *    where:
   1605     *       file = Register.File
   1606     *       [1] = Register.Index
   1607     */
   1608    index.i[0] =
   1609    index.i[1] =
   1610    index.i[2] =
   1611    index.i[3] = reg->Register.Index;
   1612 
   1613    /* There is an extra source register that indirectly subscripts
   1614     * a register file. The direct index now becomes an offset
   1615     * that is being added to the indirect register.
   1616     *
   1617     *    file[ind[2].x+1],
   1618     *    where:
   1619     *       ind = Indirect.File
   1620     *       [2] = Indirect.Index
   1621     *       .x = Indirect.SwizzleX
   1622     */
   1623    if (reg->Register.Indirect) {
   1624       union tgsi_exec_channel index2;
   1625       union tgsi_exec_channel indir_index;
   1626       const uint execmask = mach->ExecMask;
   1627       uint i;
   1628 
   1629       /* which address register (always zero now) */
   1630       index2.i[0] =
   1631       index2.i[1] =
   1632       index2.i[2] =
   1633       index2.i[3] = reg->Indirect.Index;
   1634       /* get current value of address register[swizzle] */
   1635       swizzle = reg->Indirect.Swizzle;
   1636       fetch_src_file_channel(mach,
   1637                              chan_index,
   1638                              reg->Indirect.File,
   1639                              swizzle,
   1640                              &index2,
   1641                              &ZeroVec,
   1642                              &indir_index);
   1643 
   1644       /* add value of address register to the offset */
   1645       index.i[0] += indir_index.i[0];
   1646       index.i[1] += indir_index.i[1];
   1647       index.i[2] += indir_index.i[2];
   1648       index.i[3] += indir_index.i[3];
   1649 
   1650       /* for disabled execution channels, zero-out the index to
   1651        * avoid using a potential garbage value.
   1652        */
   1653       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   1654          if ((execmask & (1 << i)) == 0)
   1655             index.i[i] = 0;
   1656       }
   1657    }
   1658 
   1659    /* There is an extra source register that is a second
   1660     * subscript to a register file. Effectively it means that
   1661     * the register file is actually a 2D array of registers.
   1662     *
   1663     *    file[3][1],
   1664     *    where:
   1665     *       [3] = Dimension.Index
   1666     */
   1667    if (reg->Register.Dimension) {
   1668       index2D.i[0] =
   1669       index2D.i[1] =
   1670       index2D.i[2] =
   1671       index2D.i[3] = reg->Dimension.Index;
   1672 
   1673       /* Again, the second subscript index can be addressed indirectly
   1674        * identically to the first one.
   1675        * Nothing stops us from indirectly addressing the indirect register,
   1676        * but there is no need for that, so we won't exercise it.
   1677        *
   1678        *    file[ind[4].y+3][1],
   1679        *    where:
   1680        *       ind = DimIndirect.File
   1681        *       [4] = DimIndirect.Index
   1682        *       .y = DimIndirect.SwizzleX
   1683        */
   1684       if (reg->Dimension.Indirect) {
   1685          union tgsi_exec_channel index2;
   1686          union tgsi_exec_channel indir_index;
   1687          const uint execmask = mach->ExecMask;
   1688          uint i;
   1689 
   1690          index2.i[0] =
   1691          index2.i[1] =
   1692          index2.i[2] =
   1693          index2.i[3] = reg->DimIndirect.Index;
   1694 
   1695          swizzle = reg->DimIndirect.Swizzle;
   1696          fetch_src_file_channel(mach,
   1697                                 chan_index,
   1698                                 reg->DimIndirect.File,
   1699                                 swizzle,
   1700                                 &index2,
   1701                                 &ZeroVec,
   1702                                 &indir_index);
   1703 
   1704          index2D.i[0] += indir_index.i[0];
   1705          index2D.i[1] += indir_index.i[1];
   1706          index2D.i[2] += indir_index.i[2];
   1707          index2D.i[3] += indir_index.i[3];
   1708 
   1709          /* for disabled execution channels, zero-out the index to
   1710           * avoid using a potential garbage value.
   1711           */
   1712          for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   1713             if ((execmask & (1 << i)) == 0) {
   1714                index2D.i[i] = 0;
   1715             }
   1716          }
   1717       }
   1718 
   1719       /* If by any chance there was a need for a 3D array of register
   1720        * files, we would have to check whether Dimension is followed
   1721        * by a dimension register and continue the saga.
   1722        */
   1723    } else {
   1724       index2D.i[0] =
   1725       index2D.i[1] =
   1726       index2D.i[2] =
   1727       index2D.i[3] = 0;
   1728    }
   1729 
   1730    swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
   1731    fetch_src_file_channel(mach,
   1732                           chan_index,
   1733                           reg->Register.File,
   1734                           swizzle,
   1735                           &index,
   1736                           &index2D,
   1737                           chan);
   1738 }
   1739 
   1740 static void
   1741 fetch_source(const struct tgsi_exec_machine *mach,
   1742              union tgsi_exec_channel *chan,
   1743              const struct tgsi_full_src_register *reg,
   1744              const uint chan_index,
   1745              enum tgsi_exec_datatype src_datatype)
   1746 {
   1747    fetch_source_d(mach, chan, reg, chan_index, src_datatype);
   1748 
   1749    if (reg->Register.Absolute) {
   1750       if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
   1751          micro_abs(chan, chan);
   1752       } else {
   1753          micro_iabs(chan, chan);
   1754       }
   1755    }
   1756 
   1757    if (reg->Register.Negate) {
   1758       if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
   1759          micro_neg(chan, chan);
   1760       } else {
   1761          micro_ineg(chan, chan);
   1762       }
   1763    }
   1764 }
   1765 
        /**
         * Resolve the storage location for one channel of a destination register.
         *
         * Computes the address of channel \p chan_index of the register designated
         * by \p reg and returns it.  Nothing is written to the register files here;
         * the callers perform the exec-mask aware write through the returned pointer.
         *
         * \param mach          interpreter state holding the register files
         * \param chan          value about to be stored; only referenced by the
         *                      disabled inf/NaN debugging check
         * \param reg           destination operand
         * \param inst          instruction being executed (not read in this function)
         * \param chan_index    channel of the destination register to address
         * \param dst_datatype  type of the value; only referenced by the disabled
         *                      debugging check
         *
         * \return pointer into mach->Outputs, mach->Temps or mach->Addrs; a pointer
         *         to a function-local static scratch channel for TGSI_FILE_NULL;
         *         NULL (after assert(0)) for any other register file.
         */
   1766 static union tgsi_exec_channel *
   1767 store_dest_dstret(struct tgsi_exec_machine *mach,
   1768                  const union tgsi_exec_channel *chan,
   1769                  const struct tgsi_full_dst_register *reg,
   1770                  const struct tgsi_full_instruction *inst,
   1771                  uint chan_index,
   1772                  enum tgsi_exec_datatype dst_datatype)
   1773 {
   1774    static union tgsi_exec_channel null;
   1775    union tgsi_exec_channel *dst;
   1776    union tgsi_exec_channel index2D;
   1777    int offset = 0;  /* indirection offset */
   1778    int index;
   1779 
   1780    /* for debugging */
   1781    if (0 && dst_datatype == TGSI_EXEC_DATA_FLOAT) {
   1782       check_inf_or_nan(chan);
   1783    }
   1784 
   1785    /* There is an extra source register that indirectly subscripts
   1786     * a register file. The direct index now becomes an offset
   1787     * that is being added to the indirect register.
   1788     *
   1789     *    file[ind[2].x+1],
   1790     *    where:
   1791     *       ind = Indirect.File
   1792     *       [2] = Indirect.Index
   1793     *       .x = Indirect.Swizzle
   1794     */
   1795    if (reg->Register.Indirect) {
   1796       union tgsi_exec_channel index;
   1797       union tgsi_exec_channel indir_index;
   1798       uint swizzle;
   1799 
   1800       /* which address register (always zero for now) */
   1801       index.i[0] =
   1802       index.i[1] =
   1803       index.i[2] =
   1804       index.i[3] = reg->Indirect.Index;
   1805 
   1806       /* get current value of address register[swizzle] */
   1807       swizzle = reg->Indirect.Swizzle;
   1808 
   1809       /* fetch values from the address/indirection register */
   1810       fetch_src_file_channel(mach,
   1811                              chan_index,
   1812                              reg->Indirect.File,
   1813                              swizzle,
   1814                              &index,
   1815                              &ZeroVec,
   1816                              &indir_index);
   1817 
   1818       /* save indirection offset */
              /* NOTE(review): only quad element 0 of the address value is used, and
               * it is used even when element 0 is disabled in the exec mask (no
               * zeroing as in the Dimension.Indirect path below).  All four quad
               * elements therefore share one destination address.
               */
   1819       offset = indir_index.i[0];
   1820    }
   1821 
   1822    /* There is an extra source register that is a second
   1823     * subscript to a register file. Effectively it means that
   1824     * the register file is actually a 2D array of registers.
   1825     *
   1826     *    file[3][1],
   1827     *    where:
   1828     *       [3] = Dimension.Index
   1829     */
           /* NOTE(review): index2D is computed in this section but is not read by
            * the register-file switch further down - confirm whether 2D
            * destinations are meant to be honoured here.
            */
   1830    if (reg->Register.Dimension) {
   1831       index2D.i[0] =
   1832       index2D.i[1] =
   1833       index2D.i[2] =
   1834       index2D.i[3] = reg->Dimension.Index;
   1835 
   1836       /* Again, the second subscript index can be addressed indirectly
   1837        * identically to the first one.
   1838        * Nothing stops us from indirectly addressing the indirect register,
   1839        * but there is no need for that, so we won't exercise it.
   1840        *
   1841        *    file[ind[4].y+3][1],
   1842        *    where:
   1843        *       ind = DimIndirect.File
   1844        *       [4] = DimIndirect.Index
   1845        *       .y = DimIndirect.Swizzle
   1846        */
   1847       if (reg->Dimension.Indirect) {
   1848          union tgsi_exec_channel index2;
   1849          union tgsi_exec_channel indir_index;
   1850          const uint execmask = mach->ExecMask;
   1851          unsigned swizzle;
   1852          uint i;
   1853 
   1854          index2.i[0] =
   1855          index2.i[1] =
   1856          index2.i[2] =
   1857          index2.i[3] = reg->DimIndirect.Index;
   1858 
   1859          swizzle = reg->DimIndirect.Swizzle;
   1860          fetch_src_file_channel(mach,
   1861                                 chan_index,
   1862                                 reg->DimIndirect.File,
   1863                                 swizzle,
   1864                                 &index2,
   1865                                 &ZeroVec,
   1866                                 &indir_index);
   1867 
   1868          index2D.i[0] += indir_index.i[0];
   1869          index2D.i[1] += indir_index.i[1];
   1870          index2D.i[2] += indir_index.i[2];
   1871          index2D.i[3] += indir_index.i[3];
   1872 
   1873          /* for disabled execution channels, zero-out the index to
   1874           * avoid using a potential garbage value.
   1875           */
   1876          for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   1877             if ((execmask & (1 << i)) == 0) {
   1878                index2D.i[i] = 0;
   1879             }
   1880          }
   1881       }
   1882 
   1883       /* If by any chance there was a need for a 3D array of register
   1884        * files, we would have to check whether Dimension is followed
   1885        * by a dimension register and continue the saga.
   1886        */
   1887    } else {
   1888       index2D.i[0] =
   1889       index2D.i[1] =
   1890       index2D.i[2] =
   1891       index2D.i[3] = 0;
   1892    }
   1893 
           /* Select the register file and form the final address. */
   1894    switch (reg->Register.File) {
   1895    case TGSI_FILE_NULL:
              /* writes to the null file land in a static scratch channel */
   1896       dst = &null;
   1897       break;
   1898 
   1899    case TGSI_FILE_OUTPUT:
              /* The TEMP_OUTPUT temporary holds a base offset into Outputs; it is
               * advanced by NumOutputs in emit_vertex().
               */
   1900       index = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
   1901          + reg->Register.Index;
   1902       dst = &mach->Outputs[offset + index].xyzw[chan_index];
              /* NOTE(review): the disabled block below references execmask and i,
               * neither of which is declared at this scope.
               */
   1903 #if 0
   1904       debug_printf("NumOutputs = %d, TEMP_O_C/I = %d, redindex = %d\n",
   1905                    mach->NumOutputs, mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0],
   1906                    reg->Register.Index);
   1907       if (PIPE_SHADER_GEOMETRY == mach->ShaderType) {
   1908          debug_printf("STORING OUT[%d] mask(%d), = (", offset + index, execmask);
   1909          for (i = 0; i < TGSI_QUAD_SIZE; i++)
   1910             if (execmask & (1 << i))
   1911                debug_printf("%f, ", chan->f[i]);
   1912          debug_printf(")\n");
   1913       }
   1914 #endif
   1915       break;
   1916 
   1917    case TGSI_FILE_TEMPORARY:
   1918       index = reg->Register.Index;
              /* NOTE(review): the assert bounds only the direct index; the
               * indirection offset added on the next line is not range-checked.
               */
   1919       assert( index < TGSI_EXEC_NUM_TEMPS );
   1920       dst = &mach->Temps[offset + index].xyzw[chan_index];
   1921       break;
   1922 
   1923    case TGSI_FILE_ADDRESS:
              /* the indirection offset is not applied to the address file */
   1924       index = reg->Register.Index;
   1925       dst = &mach->Addrs[index].xyzw[chan_index];
   1926       break;
   1927 
   1928    default:
              /* any other destination file is unsupported */
   1929       assert( 0 );
   1930       return NULL;
   1931    }
   1932 
   1933    return dst;
   1934 }
   1935 
   1936 static void
   1937 store_dest_double(struct tgsi_exec_machine *mach,
   1938                  const union tgsi_exec_channel *chan,
   1939                  const struct tgsi_full_dst_register *reg,
   1940                  const struct tgsi_full_instruction *inst,
   1941                  uint chan_index,
   1942                  enum tgsi_exec_datatype dst_datatype)
   1943 {
   1944    union tgsi_exec_channel *dst;
   1945    const uint execmask = mach->ExecMask;
   1946    int i;
   1947 
   1948    dst = store_dest_dstret(mach, chan, reg, inst, chan_index,
   1949 			   dst_datatype);
   1950    if (!dst)
   1951       return;
   1952 
   1953    /* doubles path */
   1954    for (i = 0; i < TGSI_QUAD_SIZE; i++)
   1955       if (execmask & (1 << i))
   1956          dst->i[i] = chan->i[i];
   1957 }
   1958 
   1959 static void
   1960 store_dest(struct tgsi_exec_machine *mach,
   1961            const union tgsi_exec_channel *chan,
   1962            const struct tgsi_full_dst_register *reg,
   1963            const struct tgsi_full_instruction *inst,
   1964            uint chan_index,
   1965            enum tgsi_exec_datatype dst_datatype)
   1966 {
   1967    union tgsi_exec_channel *dst;
   1968    const uint execmask = mach->ExecMask;
   1969    int i;
   1970 
   1971    dst = store_dest_dstret(mach, chan, reg, inst, chan_index,
   1972                     dst_datatype);
   1973    if (!dst)
   1974       return;
   1975 
   1976    if (!inst->Instruction.Saturate) {
   1977       for (i = 0; i < TGSI_QUAD_SIZE; i++)
   1978          if (execmask & (1 << i))
   1979             dst->i[i] = chan->i[i];
   1980    }
   1981    else {
   1982       for (i = 0; i < TGSI_QUAD_SIZE; i++)
   1983          if (execmask & (1 << i)) {
   1984             if (chan->f[i] < 0.0f)
   1985                dst->f[i] = 0.0f;
   1986             else if (chan->f[i] > 1.0f)
   1987                dst->f[i] = 1.0f;
   1988             else
   1989                dst->i[i] = chan->i[i];
   1990          }
   1991    }
   1992 }
   1993 
        /*
         * Shorthands for fetch_source() on source operand INDEX of the current
         * instruction.  Both expand to code that uses the identifiers `mach' and
         * `inst', which must therefore be in scope at the point of use.
         *
         *    VAL    pointer to the channel that receives the value
         *    INDEX  index into inst->Src[]
         *    CHAN   channel of the operand to read
         */

        /* Fetch with float modifier handling (TGSI_EXEC_DATA_FLOAT). */
   1994 #define FETCH(VAL,INDEX,CHAN)\
   1995     fetch_source(mach, VAL, &inst->Src[INDEX], CHAN, TGSI_EXEC_DATA_FLOAT)
   1996 
        /* Fetch with integer modifier handling (TGSI_EXEC_DATA_INT). */
   1997 #define IFETCH(VAL,INDEX,CHAN)\
   1998     fetch_source(mach, VAL, &inst->Src[INDEX], CHAN, TGSI_EXEC_DATA_INT)
   1999 
   2000 
   2001 /**
   2002  * Execute ARB-style KIL which is predicated by a src register.
   2003  * Kill fragment if any of the four values is less than zero.
   2004  */
   2005 static void
   2006 exec_kill_if(struct tgsi_exec_machine *mach,
   2007              const struct tgsi_full_instruction *inst)
   2008 {
   2009    uint uniquemask;
   2010    uint chan_index;
   2011    uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
   2012    union tgsi_exec_channel r[1];
   2013 
   2014    /* This mask stores component bits that were already tested. */
   2015    uniquemask = 0;
   2016 
   2017    for (chan_index = 0; chan_index < 4; chan_index++)
   2018    {
   2019       uint swizzle;
   2020       uint i;
   2021 
   2022       /* unswizzle channel */
   2023       swizzle = tgsi_util_get_full_src_register_swizzle (
   2024                         &inst->Src[0],
   2025                         chan_index);
   2026 
   2027       /* check if the component has not been already tested */
   2028       if (uniquemask & (1 << swizzle))
   2029          continue;
   2030       uniquemask |= 1 << swizzle;
   2031 
   2032       FETCH(&r[0], 0, chan_index);
   2033       for (i = 0; i < 4; i++)
   2034          if (r[0].f[i] < 0.0f)
   2035             kilmask |= 1 << i;
   2036    }
   2037 
   2038    /* restrict to fragments currently executing */
   2039    kilmask &= mach->ExecMask;
   2040 
   2041    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
   2042 }
   2043 
   2044 /**
   2045  * Unconditional fragment kill/discard.
   2046  */
   2047 static void
   2048 exec_kill(struct tgsi_exec_machine *mach,
   2049           const struct tgsi_full_instruction *inst)
   2050 {
   2051    uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
   2052 
   2053    /* kill fragment for all fragments currently executing */
   2054    kilmask = mach->ExecMask;
   2055    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
   2056 }
   2057 
   2058 static void
   2059 emit_vertex(struct tgsi_exec_machine *mach)
   2060 {
   2061    /* FIXME: check for exec mask correctly
   2062    unsigned i;
   2063    for (i = 0; i < TGSI_QUAD_SIZE; ++i) {
   2064          if ((mach->ExecMask & (1 << i)))
   2065    */
   2066    if (mach->ExecMask) {
   2067       if (mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]] >= mach->MaxOutputVertices)
   2068          return;
   2069 
   2070       mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += mach->NumOutputs;
   2071       mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
   2072    }
   2073 }
   2074 
   2075 static void
   2076 emit_primitive(struct tgsi_exec_machine *mach)
   2077 {
   2078    unsigned *prim_count = &mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0];
   2079    /* FIXME: check for exec mask correctly
   2080    unsigned i;
   2081    for (i = 0; i < TGSI_QUAD_SIZE; ++i) {
   2082          if ((mach->ExecMask & (1 << i)))
   2083    */
   2084    if (mach->ExecMask) {
   2085       ++(*prim_count);
   2086       debug_assert((*prim_count * mach->NumOutputs) < mach->MaxGeometryShaderOutputs);
   2087       mach->Primitives[*prim_count] = 0;
   2088    }
   2089 }
   2090 
   2091 static void
   2092 conditional_emit_primitive(struct tgsi_exec_machine *mach)
   2093 {
   2094    if (PIPE_SHADER_GEOMETRY == mach->ShaderType) {
   2095       int emitted_verts =
   2096          mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]];
   2097       if (emitted_verts) {
   2098          emit_primitive(mach);
   2099       }
   2100    }
   2101 }
   2102 
   2103 
   2104 /*
   2105  * Fetch four texture samples using STR texture coordinates.
   2106  */
   2107 static void
   2108 fetch_texel( struct tgsi_sampler *sampler,
   2109              const unsigned sview_idx,
   2110              const unsigned sampler_idx,
   2111              const union tgsi_exec_channel *s,
   2112              const union tgsi_exec_channel *t,
   2113              const union tgsi_exec_channel *p,
   2114              const union tgsi_exec_channel *c0,
   2115              const union tgsi_exec_channel *c1,
   2116              float derivs[3][2][TGSI_QUAD_SIZE],
   2117              const int8_t offset[3],
   2118              enum tgsi_sampler_control control,
   2119              union tgsi_exec_channel *r,
   2120              union tgsi_exec_channel *g,
   2121              union tgsi_exec_channel *b,
   2122              union tgsi_exec_channel *a )
   2123 {
   2124    uint j;
   2125    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
   2126 
   2127    /* FIXME: handle explicit derivs, offsets */
   2128    sampler->get_samples(sampler, sview_idx, sampler_idx,
   2129                         s->f, t->f, p->f, c0->f, c1->f, derivs, offset, control, rgba);
   2130 
   2131    for (j = 0; j < 4; j++) {
   2132       r->f[j] = rgba[0][j];
   2133       g->f[j] = rgba[1][j];
   2134       b->f[j] = rgba[2][j];
   2135       a->f[j] = rgba[3][j];
   2136    }
   2137 }
   2138 
   2139 
        /*
         * Values for the `modifier' argument of exec_tex().  As handled there:
         *
         *    NONE          no extra operand is fetched
         *    PROJECTED     coordinates (and shadow reference) are divided by the
         *                  extra operand
         *    LOD_BIAS      extra operand passed on with TGSI_SAMPLER_LOD_BIAS
         *    EXPLICIT_LOD  extra operand passed on with TGSI_SAMPLER_LOD_EXPLICIT
         *    LEVEL_ZERO    not accepted by exec_tex() (asserted against)
         *    GATHER        extra operand passed on with TGSI_SAMPLER_GATHER
         */
   2140 #define TEX_MODIFIER_NONE           0
   2141 #define TEX_MODIFIER_PROJECTED      1
   2142 #define TEX_MODIFIER_LOD_BIAS       2
   2143 #define TEX_MODIFIER_EXPLICIT_LOD   3
   2144 #define TEX_MODIFIER_LEVEL_ZERO     4
   2145 #define TEX_MODIFIER_GATHER         5
   2146 
   2147 /*
   2148  * Fetch all 3 (for s,t,r coords) texel offsets, put them into int array.
   2149  */
   2150 static void
   2151 fetch_texel_offsets(struct tgsi_exec_machine *mach,
   2152                     const struct tgsi_full_instruction *inst,
   2153                     int8_t offsets[3])
   2154 {
   2155    if (inst->Texture.NumOffsets == 1) {
   2156       union tgsi_exec_channel index;
   2157       union tgsi_exec_channel offset[3];
   2158       index.i[0] = index.i[1] = index.i[2] = index.i[3] = inst->TexOffsets[0].Index;
   2159       fetch_src_file_channel(mach, 0, inst->TexOffsets[0].File,
   2160                              inst->TexOffsets[0].SwizzleX, &index, &ZeroVec, &offset[0]);
   2161       fetch_src_file_channel(mach, 0, inst->TexOffsets[0].File,
   2162                              inst->TexOffsets[0].SwizzleY, &index, &ZeroVec, &offset[1]);
   2163       fetch_src_file_channel(mach, 0, inst->TexOffsets[0].File,
   2164                              inst->TexOffsets[0].SwizzleZ, &index, &ZeroVec, &offset[2]);
   2165      offsets[0] = offset[0].i[0];
   2166      offsets[1] = offset[1].i[0];
   2167      offsets[2] = offset[2].i[0];
   2168    } else {
   2169      assert(inst->Texture.NumOffsets == 0);
   2170      offsets[0] = offsets[1] = offsets[2] = 0;
   2171    }
   2172 }
   2173 
   2174 
   2175 /*
   2176  * Fetch dx and dy values for one channel (s, t or r).
   2177  * Put dx values into one float array, dy values into another.
   2178  */
   2179 static void
   2180 fetch_assign_deriv_channel(struct tgsi_exec_machine *mach,
   2181                            const struct tgsi_full_instruction *inst,
   2182                            unsigned regdsrcx,
   2183                            unsigned chan,
   2184                            float derivs[2][TGSI_QUAD_SIZE])
   2185 {
   2186    union tgsi_exec_channel d;
   2187    FETCH(&d, regdsrcx, chan);
   2188    derivs[0][0] = d.f[0];
   2189    derivs[0][1] = d.f[1];
   2190    derivs[0][2] = d.f[2];
   2191    derivs[0][3] = d.f[3];
   2192    FETCH(&d, regdsrcx + 1, chan);
   2193    derivs[1][0] = d.f[0];
   2194    derivs[1][1] = d.f[1];
   2195    derivs[1][2] = d.f[2];
   2196    derivs[1][3] = d.f[3];
   2197 }
   2198 
   2199 static uint
   2200 fetch_sampler_unit(struct tgsi_exec_machine *mach,
   2201                    const struct tgsi_full_instruction *inst,
   2202                    uint sampler)
   2203 {
   2204    uint unit = 0;
   2205    int i;
   2206    if (inst->Src[sampler].Register.Indirect) {
   2207       const struct tgsi_full_src_register *reg = &inst->Src[sampler];
   2208       union tgsi_exec_channel indir_index, index2;
   2209       const uint execmask = mach->ExecMask;
   2210       index2.i[0] =
   2211       index2.i[1] =
   2212       index2.i[2] =
   2213       index2.i[3] = reg->Indirect.Index;
   2214 
   2215       fetch_src_file_channel(mach,
   2216                              0,
   2217                              reg->Indirect.File,
   2218                              reg->Indirect.Swizzle,
   2219                              &index2,
   2220                              &ZeroVec,
   2221                              &indir_index);
   2222       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   2223          if (execmask & (1 << i)) {
   2224             unit = inst->Src[sampler].Register.Index + indir_index.i[i];
   2225             break;
   2226          }
   2227       }
   2228 
   2229    } else {
   2230       unit = inst->Src[sampler].Register.Index;
   2231    }
   2232    return unit;
   2233 }
   2234 
   2235 /*
   2236  * execute a texture instruction.
   2237  *
   2238  * modifier is used to control the channel routing for the
   2239  * instruction variants like proj, lod, and texture with lod bias.
   2240  * sampler indicates which src register the sampler is contained in.
         *
         * Outline:
         *  - The sampler unit is resolved with fetch_sampler_unit() and is used
         *    as both the sampler-view and the sampler index for the fetch.
         *  - args[0..4] become the s, t, p, c0 and c1 inputs of fetch_texel();
         *    slots not filled from the instruction point at ZeroVec.
         *  - For any modifier other than TEX_MODIFIER_NONE an extra operand is
         *    read (src0.w when sampler == 1, otherwise src1.x) into the last
         *    slot.  For TEX_MODIFIER_PROJECTED it is instead used as the divisor
         *    of the coordinates and the shadow reference, and the last slot is
         *    zero.
         *  - No explicit derivatives are passed (derivs is NULL).
         *  - r[0..3] are overwritten with the fetched R, G, B, A values, which
         *    are stored to Dst[0] according to its write mask.
         *
         * TEX_MODIFIER_LEVEL_ZERO and TGSI_TEXTURE_BUFFER are asserted against.
   2241  */
   2242 static void
   2243 exec_tex(struct tgsi_exec_machine *mach,
   2244          const struct tgsi_full_instruction *inst,
   2245          uint modifier, uint sampler)
   2246 {
   2247    const union tgsi_exec_channel *args[5], *proj = NULL;
           /* r[] holds the fetched inputs first and the RGBA results afterwards */
   2248    union tgsi_exec_channel r[5];
   2249    enum tgsi_sampler_control control = TGSI_SAMPLER_LOD_NONE;
   2250    uint chan;
   2251    uint unit;
   2252    int8_t offsets[3];
   2253    int dim, shadow_ref, i;
   2254 
   2255    unit = fetch_sampler_unit(mach, inst, sampler);
   2256    /* always fetch all 3 offsets, overkill but keeps code simple */
   2257    fetch_texel_offsets(mach, inst, offsets);
   2258 
   2259    assert(modifier != TEX_MODIFIER_LEVEL_ZERO);
   2260    assert(inst->Texture.Texture != TGSI_TEXTURE_BUFFER);
   2261 
           /* number of coordinate channels, and the args[] slot of the shadow
            * reference value (negative when the target has none)
            */
   2262    dim = tgsi_util_get_texture_coord_dim(inst->Texture.Texture);
   2263    shadow_ref = tgsi_util_get_shadow_ref_src_index(inst->Texture.Texture);
   2264 
   2265    assert(dim <= 4);
   2266    if (shadow_ref >= 0)
   2267       assert(shadow_ref >= dim && shadow_ref < ARRAY_SIZE(args));
   2268 
   2269    /* fetch modifier to the last argument */
   2270    if (modifier != TEX_MODIFIER_NONE) {
   2271       const int last = ARRAY_SIZE(args) - 1;
   2272 
   2273       /* fetch modifier from src0.w or src1.x */
   2274       if (sampler == 1) {
   2275          assert(dim <= TGSI_CHAN_W && shadow_ref != TGSI_CHAN_W);
   2276          FETCH(&r[last], 0, TGSI_CHAN_W);
   2277       }
   2278       else {
   2279          assert(shadow_ref != 4);
   2280          FETCH(&r[last], 1, TGSI_CHAN_X);
   2281       }
   2282 
   2283       if (modifier != TEX_MODIFIER_PROJECTED) {
   2284          args[last] = &r[last];
   2285       }
   2286       else {
                 /* projection: the operand is a divisor, not a sampler input */
   2287          proj = &r[last];
   2288          args[last] = &ZeroVec;
   2289       }
   2290 
   2291       /* point unused arguments to zero vector */
   2292       for (i = dim; i < last; i++)
   2293          args[i] = &ZeroVec;
   2294 
   2295       if (modifier == TEX_MODIFIER_EXPLICIT_LOD)
   2296          control = TGSI_SAMPLER_LOD_EXPLICIT;
   2297       else if (modifier == TEX_MODIFIER_LOD_BIAS)
   2298          control = TGSI_SAMPLER_LOD_BIAS;
   2299       else if (modifier == TEX_MODIFIER_GATHER)
   2300          control = TGSI_SAMPLER_GATHER;
   2301    }
   2302    else {
   2303       for (i = dim; i < ARRAY_SIZE(args); i++)
   2304          args[i] = &ZeroVec;
   2305    }
   2306 
   2307    /* fetch coordinates */
   2308    for (i = 0; i < dim; i++) {
   2309       FETCH(&r[i], 0, TGSI_CHAN_X + i);
   2310 
   2311       if (proj)
   2312          micro_div(&r[i], &r[i], proj);
   2313 
   2314       args[i] = &r[i];
   2315    }
   2316 
   2317    /* fetch reference value */
   2318    if (shadow_ref >= 0) {
              /* slot n maps to source operand n / 4, channel n % 4 */
   2319       FETCH(&r[shadow_ref], shadow_ref / 4, TGSI_CHAN_X + (shadow_ref % 4));
   2320 
   2321       if (proj)
   2322          micro_div(&r[shadow_ref], &r[shadow_ref], proj);
   2323 
   2324       args[shadow_ref] = &r[shadow_ref];
   2325    }
   2326 
   2327    fetch_texel(mach->Sampler, unit, unit,
   2328          args[0], args[1], args[2], args[3], args[4],
   2329          NULL, offsets, control,
   2330          &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
   2331 
   2332 #if 0
   2333    debug_printf("fetch r: %g %g %g %g\n",
   2334          r[0].f[0], r[0].f[1], r[0].f[2], r[0].f[3]);
   2335    debug_printf("fetch g: %g %g %g %g\n",
   2336          r[1].f[0], r[1].f[1], r[1].f[2], r[1].f[3]);
   2337    debug_printf("fetch b: %g %g %g %g\n",
   2338          r[2].f[0], r[2].f[1], r[2].f[2], r[2].f[3]);
   2339    debug_printf("fetch a: %g %g %g %g\n",
   2340          r[3].f[0], r[3].f[1], r[3].f[2], r[3].f[3]);
   2341 #endif
   2342 
           /* write back only the channels enabled in the destination write mask */
   2343    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   2344       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   2345          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
   2346       }
   2347    }
   2348 }
   2349 
   2350 static void
   2351 exec_lodq(struct tgsi_exec_machine *mach,
   2352           const struct tgsi_full_instruction *inst)
   2353 {
   2354    uint resource_unit, sampler_unit;
   2355    int dim;
   2356    int i;
   2357    union tgsi_exec_channel coords[4];
   2358    const union tgsi_exec_channel *args[ARRAY_SIZE(coords)];
   2359    union tgsi_exec_channel r[2];
   2360 
   2361    resource_unit = fetch_sampler_unit(mach, inst, 1);
   2362    if (inst->Instruction.Opcode == TGSI_OPCODE_LOD) {
   2363       uint target = mach->SamplerViews[resource_unit].Resource;
   2364       dim = tgsi_util_get_texture_coord_dim(target);
   2365       sampler_unit = fetch_sampler_unit(mach, inst, 2);
   2366    } else {
   2367       dim = tgsi_util_get_texture_coord_dim(inst->Texture.Texture);
   2368       sampler_unit = resource_unit;
   2369    }
   2370    assert(dim <= ARRAY_SIZE(coords));
   2371    /* fetch coordinates */
   2372    for (i = 0; i < dim; i++) {
   2373       FETCH(&coords[i], 0, TGSI_CHAN_X + i);
   2374       args[i] = &coords[i];
   2375    }
   2376    for (i = dim; i < ARRAY_SIZE(coords); i++) {
   2377       args[i] = &ZeroVec;
   2378    }
   2379    mach->Sampler->query_lod(mach->Sampler, resource_unit, sampler_unit,
   2380                             args[0]->f,
   2381                             args[1]->f,
   2382                             args[2]->f,
   2383                             args[3]->f,
   2384                             TGSI_SAMPLER_LOD_NONE,
   2385                             r[0].f,
   2386                             r[1].f);
   2387 
   2388    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
   2389       store_dest(mach, &r[0], &inst->Dst[0], inst, TGSI_CHAN_X,
   2390                  TGSI_EXEC_DATA_FLOAT);
   2391    }
   2392    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
   2393       store_dest(mach, &r[1], &inst->Dst[0], inst, TGSI_CHAN_Y,
   2394                  TGSI_EXEC_DATA_FLOAT);
   2395    }
   2396    if (inst->Instruction.Opcode == TGSI_OPCODE_LOD) {
   2397       unsigned char swizzles[4];
   2398       unsigned chan;
   2399       swizzles[0] = inst->Src[1].Register.SwizzleX;
   2400       swizzles[1] = inst->Src[1].Register.SwizzleY;
   2401       swizzles[2] = inst->Src[1].Register.SwizzleZ;
   2402       swizzles[3] = inst->Src[1].Register.SwizzleW;
   2403 
   2404       for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   2405          if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   2406             if (swizzles[chan] >= 2) {
   2407                store_dest(mach, &ZeroVec,
   2408                           &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
   2409             } else {
   2410                store_dest(mach, &r[swizzles[chan]],
   2411                           &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
   2412             }
   2413          }
   2414       }
   2415    } else {
   2416       if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
   2417          store_dest(mach, &r[0], &inst->Dst[0], inst, TGSI_CHAN_X,
   2418                     TGSI_EXEC_DATA_FLOAT);
   2419       }
   2420       if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
   2421          store_dest(mach, &r[1], &inst->Dst[0], inst, TGSI_CHAN_Y,
   2422                     TGSI_EXEC_DATA_FLOAT);
   2423       }
   2424    }
   2425 }
   2426 
   2427 static void
   2428 exec_txd(struct tgsi_exec_machine *mach,
   2429          const struct tgsi_full_instruction *inst)
   2430 {
   2431    union tgsi_exec_channel r[4];
   2432    float derivs[3][2][TGSI_QUAD_SIZE];
   2433    uint chan;
   2434    uint unit;
   2435    int8_t offsets[3];
   2436 
   2437    unit = fetch_sampler_unit(mach, inst, 3);
   2438    /* always fetch all 3 offsets, overkill but keeps code simple */
   2439    fetch_texel_offsets(mach, inst, offsets);
   2440 
   2441    switch (inst->Texture.Texture) {
   2442    case TGSI_TEXTURE_1D:
   2443       FETCH(&r[0], 0, TGSI_CHAN_X);
   2444 
   2445       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
   2446 
   2447       fetch_texel(mach->Sampler, unit, unit,
   2448                   &r[0], &ZeroVec, &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, C, LOD */
   2449                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
   2450                   &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
   2451       break;
   2452 
   2453    case TGSI_TEXTURE_SHADOW1D:
   2454    case TGSI_TEXTURE_1D_ARRAY:
   2455    case TGSI_TEXTURE_SHADOW1D_ARRAY:
   2456       /* SHADOW1D/1D_ARRAY would not need Y/Z respectively, but don't bother */
   2457       FETCH(&r[0], 0, TGSI_CHAN_X);
   2458       FETCH(&r[1], 0, TGSI_CHAN_Y);
   2459       FETCH(&r[2], 0, TGSI_CHAN_Z);
   2460 
   2461       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
   2462 
   2463       fetch_texel(mach->Sampler, unit, unit,
   2464                   &r[0], &r[1], &r[2], &ZeroVec, &ZeroVec,   /* S, T, P, C, LOD */
   2465                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
   2466                   &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
   2467       break;
   2468 
   2469    case TGSI_TEXTURE_2D:
   2470    case TGSI_TEXTURE_RECT:
   2471       FETCH(&r[0], 0, TGSI_CHAN_X);
   2472       FETCH(&r[1], 0, TGSI_CHAN_Y);
   2473 
   2474       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
   2475       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Y, derivs[1]);
   2476 
   2477       fetch_texel(mach->Sampler, unit, unit,
   2478                   &r[0], &r[1], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, C, LOD */
   2479                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
   2480                   &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
   2481       break;
   2482 
   2483 
   2484    case TGSI_TEXTURE_SHADOW2D:
   2485    case TGSI_TEXTURE_SHADOWRECT:
   2486    case TGSI_TEXTURE_2D_ARRAY:
   2487    case TGSI_TEXTURE_SHADOW2D_ARRAY:
   2488       /* only SHADOW2D_ARRAY actually needs W */
   2489       FETCH(&r[0], 0, TGSI_CHAN_X);
   2490       FETCH(&r[1], 0, TGSI_CHAN_Y);
   2491       FETCH(&r[2], 0, TGSI_CHAN_Z);
   2492       FETCH(&r[3], 0, TGSI_CHAN_W);
   2493 
   2494       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
   2495       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Y, derivs[1]);
   2496 
   2497       fetch_texel(mach->Sampler, unit, unit,
   2498                   &r[0], &r[1], &r[2], &r[3], &ZeroVec,   /* inputs */
   2499                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
   2500                   &r[0], &r[1], &r[2], &r[3]);     /* outputs */
   2501       break;
   2502 
   2503    case TGSI_TEXTURE_3D:
   2504    case TGSI_TEXTURE_CUBE:
   2505    case TGSI_TEXTURE_CUBE_ARRAY:
   2506    case TGSI_TEXTURE_SHADOWCUBE:
   2507       /* only TEXTURE_CUBE_ARRAY and TEXTURE_SHADOWCUBE actually need W */
   2508       FETCH(&r[0], 0, TGSI_CHAN_X);
   2509       FETCH(&r[1], 0, TGSI_CHAN_Y);
   2510       FETCH(&r[2], 0, TGSI_CHAN_Z);
   2511       FETCH(&r[3], 0, TGSI_CHAN_W);
   2512 
   2513       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
   2514       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Y, derivs[1]);
   2515       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Z, derivs[2]);
   2516 
   2517       fetch_texel(mach->Sampler, unit, unit,
   2518                   &r[0], &r[1], &r[2], &r[3], &ZeroVec,   /* inputs */
   2519                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
   2520                   &r[0], &r[1], &r[2], &r[3]);     /* outputs */
   2521       break;
   2522 
   2523    default:
   2524       assert(0);
   2525    }
   2526 
   2527    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   2528       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   2529          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
   2530       }
   2531    }
   2532 }
   2533 
   2534 
   2535 static void
   2536 exec_txf(struct tgsi_exec_machine *mach,
   2537          const struct tgsi_full_instruction *inst)
   2538 {
   2539    union tgsi_exec_channel r[4];
   2540    uint chan;
   2541    uint unit;
   2542    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
   2543    int j;
   2544    int8_t offsets[3];
   2545    unsigned target;
   2546 
   2547    unit = fetch_sampler_unit(mach, inst, 1);
   2548    /* always fetch all 3 offsets, overkill but keeps code simple */
   2549    fetch_texel_offsets(mach, inst, offsets);
   2550 
   2551    IFETCH(&r[3], 0, TGSI_CHAN_W);
   2552 
   2553    if (inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I ||
   2554        inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I_MS) {
   2555       target = mach->SamplerViews[unit].Resource;
   2556    }
   2557    else {
   2558       target = inst->Texture.Texture;
   2559    }
   2560    switch(target) {
   2561    case TGSI_TEXTURE_3D:
   2562    case TGSI_TEXTURE_2D_ARRAY:
   2563    case TGSI_TEXTURE_SHADOW2D_ARRAY:
   2564    case TGSI_TEXTURE_2D_ARRAY_MSAA:
   2565       IFETCH(&r[2], 0, TGSI_CHAN_Z);
   2566       /* fallthrough */
   2567    case TGSI_TEXTURE_2D:
   2568    case TGSI_TEXTURE_RECT:
   2569    case TGSI_TEXTURE_SHADOW1D_ARRAY:
   2570    case TGSI_TEXTURE_SHADOW2D:
   2571    case TGSI_TEXTURE_SHADOWRECT:
   2572    case TGSI_TEXTURE_1D_ARRAY:
   2573    case TGSI_TEXTURE_2D_MSAA:
   2574       IFETCH(&r[1], 0, TGSI_CHAN_Y);
   2575       /* fallthrough */
   2576    case TGSI_TEXTURE_BUFFER:
   2577    case TGSI_TEXTURE_1D:
   2578    case TGSI_TEXTURE_SHADOW1D:
   2579       IFETCH(&r[0], 0, TGSI_CHAN_X);
   2580       break;
   2581    default:
   2582       assert(0);
   2583       break;
   2584    }
   2585 
   2586    mach->Sampler->get_texel(mach->Sampler, unit, r[0].i, r[1].i, r[2].i, r[3].i,
   2587                             offsets, rgba);
   2588 
   2589    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
   2590       r[0].f[j] = rgba[0][j];
   2591       r[1].f[j] = rgba[1][j];
   2592       r[2].f[j] = rgba[2][j];
   2593       r[3].f[j] = rgba[3][j];
   2594    }
   2595 
   2596    if (inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I ||
   2597        inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I_MS) {
   2598       unsigned char swizzles[4];
   2599       swizzles[0] = inst->Src[1].Register.SwizzleX;
   2600       swizzles[1] = inst->Src[1].Register.SwizzleY;
   2601       swizzles[2] = inst->Src[1].Register.SwizzleZ;
   2602       swizzles[3] = inst->Src[1].Register.SwizzleW;
   2603 
   2604       for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   2605          if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   2606             store_dest(mach, &r[swizzles[chan]],
   2607                        &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
   2608          }
   2609       }
   2610    }
   2611    else {
   2612       for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   2613          if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   2614             store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
   2615          }
   2616       }
   2617    }
   2618 }
   2619 
   2620 static void
   2621 exec_txq(struct tgsi_exec_machine *mach,
   2622          const struct tgsi_full_instruction *inst)
   2623 {
   2624    int result[4];
   2625    union tgsi_exec_channel r[4], src;
   2626    uint chan;
   2627    uint unit;
   2628    int i,j;
   2629 
   2630    unit = fetch_sampler_unit(mach, inst, 1);
   2631 
   2632    fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_INT);
   2633 
   2634    /* XXX: This interface can't return per-pixel values */
   2635    mach->Sampler->get_dims(mach->Sampler, unit, src.i[0], result);
   2636 
   2637    for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   2638       for (j = 0; j < 4; j++) {
   2639          r[j].i[i] = result[j];
   2640       }
   2641    }
   2642 
   2643    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   2644       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   2645          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan,
   2646                     TGSI_EXEC_DATA_INT);
   2647       }
   2648    }
   2649 }
   2650 
   2651 static void
   2652 exec_sample(struct tgsi_exec_machine *mach,
   2653             const struct tgsi_full_instruction *inst,
   2654             uint modifier, boolean compare)
   2655 {
   2656    const uint resource_unit = inst->Src[1].Register.Index;
   2657    const uint sampler_unit = inst->Src[2].Register.Index;
   2658    union tgsi_exec_channel r[5], c1;
   2659    const union tgsi_exec_channel *lod = &ZeroVec;
   2660    enum tgsi_sampler_control control = TGSI_SAMPLER_LOD_NONE;
   2661    uint chan;
   2662    unsigned char swizzles[4];
   2663    int8_t offsets[3];
   2664 
   2665    /* always fetch all 3 offsets, overkill but keeps code simple */
   2666    fetch_texel_offsets(mach, inst, offsets);
   2667 
   2668    assert(modifier != TEX_MODIFIER_PROJECTED);
   2669 
   2670    if (modifier != TEX_MODIFIER_NONE) {
   2671       if (modifier == TEX_MODIFIER_LOD_BIAS) {
   2672          FETCH(&c1, 3, TGSI_CHAN_X);
   2673          lod = &c1;
   2674          control = TGSI_SAMPLER_LOD_BIAS;
   2675       }
   2676       else if (modifier == TEX_MODIFIER_EXPLICIT_LOD) {
   2677          FETCH(&c1, 3, TGSI_CHAN_X);
   2678          lod = &c1;
   2679          control = TGSI_SAMPLER_LOD_EXPLICIT;
   2680       }
   2681       else if (modifier == TEX_MODIFIER_GATHER) {
   2682          control = TGSI_SAMPLER_GATHER;
   2683       }
   2684       else {
   2685          assert(modifier == TEX_MODIFIER_LEVEL_ZERO);
   2686          control = TGSI_SAMPLER_LOD_ZERO;
   2687       }
   2688    }
   2689 
   2690    FETCH(&r[0], 0, TGSI_CHAN_X);
   2691 
   2692    switch (mach->SamplerViews[resource_unit].Resource) {
   2693    case TGSI_TEXTURE_1D:
   2694       if (compare) {
   2695          FETCH(&r[2], 3, TGSI_CHAN_X);
   2696          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
   2697                      &r[0], &ZeroVec, &r[2], &ZeroVec, lod, /* S, T, P, C, LOD */
   2698                      NULL, offsets, control,
   2699                      &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
   2700       }
   2701       else {
   2702          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
   2703                      &r[0], &ZeroVec, &ZeroVec, &ZeroVec, lod, /* S, T, P, C, LOD */
   2704                      NULL, offsets, control,
   2705                      &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
   2706       }
   2707       break;
   2708 
   2709    case TGSI_TEXTURE_1D_ARRAY:
   2710    case TGSI_TEXTURE_2D:
   2711    case TGSI_TEXTURE_RECT:
   2712       FETCH(&r[1], 0, TGSI_CHAN_Y);
   2713       if (compare) {
   2714          FETCH(&r[2], 3, TGSI_CHAN_X);
   2715          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
   2716                      &r[0], &r[1], &r[2], &ZeroVec, lod,    /* S, T, P, C, LOD */
   2717                      NULL, offsets, control,
   2718                      &r[0], &r[1], &r[2], &r[3]);  /* outputs */
   2719       }
   2720       else {
   2721          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
   2722                      &r[0], &r[1], &ZeroVec, &ZeroVec, lod,    /* S, T, P, C, LOD */
   2723                      NULL, offsets, control,
   2724                      &r[0], &r[1], &r[2], &r[3]);  /* outputs */
   2725       }
   2726       break;
   2727 
   2728    case TGSI_TEXTURE_2D_ARRAY:
   2729    case TGSI_TEXTURE_3D:
   2730    case TGSI_TEXTURE_CUBE:
   2731       FETCH(&r[1], 0, TGSI_CHAN_Y);
   2732       FETCH(&r[2], 0, TGSI_CHAN_Z);
   2733       if(compare) {
   2734          FETCH(&r[3], 3, TGSI_CHAN_X);
   2735          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
   2736                      &r[0], &r[1], &r[2], &r[3], lod,
   2737                      NULL, offsets, control,
   2738                      &r[0], &r[1], &r[2], &r[3]);
   2739       }
   2740       else {
   2741          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
   2742                      &r[0], &r[1], &r[2], &ZeroVec, lod,
   2743                      NULL, offsets, control,
   2744                      &r[0], &r[1], &r[2], &r[3]);
   2745       }
   2746       break;
   2747 
   2748    case TGSI_TEXTURE_CUBE_ARRAY:
   2749       FETCH(&r[1], 0, TGSI_CHAN_Y);
   2750       FETCH(&r[2], 0, TGSI_CHAN_Z);
   2751       FETCH(&r[3], 0, TGSI_CHAN_W);
   2752       if(compare) {
   2753          FETCH(&r[4], 3, TGSI_CHAN_X);
   2754          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
   2755                      &r[0], &r[1], &r[2], &r[3], &r[4],
   2756                      NULL, offsets, control,
   2757                      &r[0], &r[1], &r[2], &r[3]);
   2758       }
   2759       else {
   2760          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
   2761                      &r[0], &r[1], &r[2], &r[3], lod,
   2762                      NULL, offsets, control,
   2763                      &r[0], &r[1], &r[2], &r[3]);
   2764       }
   2765       break;
   2766 
   2767 
   2768    default:
   2769       assert(0);
   2770    }
   2771 
   2772    swizzles[0] = inst->Src[1].Register.SwizzleX;
   2773    swizzles[1] = inst->Src[1].Register.SwizzleY;
   2774    swizzles[2] = inst->Src[1].Register.SwizzleZ;
   2775    swizzles[3] = inst->Src[1].Register.SwizzleW;
   2776 
   2777    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   2778       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   2779          store_dest(mach, &r[swizzles[chan]],
   2780                     &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
   2781       }
   2782    }
   2783 }
   2784 
   2785 static void
   2786 exec_sample_d(struct tgsi_exec_machine *mach,
   2787               const struct tgsi_full_instruction *inst)
   2788 {
   2789    const uint resource_unit = inst->Src[1].Register.Index;
   2790    const uint sampler_unit = inst->Src[2].Register.Index;
   2791    union tgsi_exec_channel r[4];
   2792    float derivs[3][2][TGSI_QUAD_SIZE];
   2793    uint chan;
   2794    unsigned char swizzles[4];
   2795    int8_t offsets[3];
   2796 
   2797    /* always fetch all 3 offsets, overkill but keeps code simple */
   2798    fetch_texel_offsets(mach, inst, offsets);
   2799 
   2800    FETCH(&r[0], 0, TGSI_CHAN_X);
   2801 
   2802    switch (mach->SamplerViews[resource_unit].Resource) {
   2803    case TGSI_TEXTURE_1D:
   2804    case TGSI_TEXTURE_1D_ARRAY:
   2805       /* only 1D array actually needs Y */
   2806       FETCH(&r[1], 0, TGSI_CHAN_Y);
   2807 
   2808       fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_X, derivs[0]);
   2809 
   2810       fetch_texel(mach->Sampler, resource_unit, sampler_unit,
   2811                   &r[0], &r[1], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, C, LOD */
   2812                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
   2813                   &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
   2814       break;
   2815 
   2816    case TGSI_TEXTURE_2D:
   2817    case TGSI_TEXTURE_RECT:
   2818    case TGSI_TEXTURE_2D_ARRAY:
   2819       /* only 2D array actually needs Z */
   2820       FETCH(&r[1], 0, TGSI_CHAN_Y);
   2821       FETCH(&r[2], 0, TGSI_CHAN_Z);
   2822 
   2823       fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_X, derivs[0]);
   2824       fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_Y, derivs[1]);
   2825 
   2826       fetch_texel(mach->Sampler, resource_unit, sampler_unit,
   2827                   &r[0], &r[1], &r[2], &ZeroVec, &ZeroVec,   /* inputs */
   2828                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
   2829                   &r[0], &r[1], &r[2], &r[3]);     /* outputs */
   2830       break;
   2831 
   2832    case TGSI_TEXTURE_3D:
   2833    case TGSI_TEXTURE_CUBE:
   2834    case TGSI_TEXTURE_CUBE_ARRAY:
   2835       /* only cube array actually needs W */
   2836       FETCH(&r[1], 0, TGSI_CHAN_Y);
   2837       FETCH(&r[2], 0, TGSI_CHAN_Z);
   2838       FETCH(&r[3], 0, TGSI_CHAN_W);
   2839 
   2840       fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_X, derivs[0]);
   2841       fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_Y, derivs[1]);
   2842       fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_Z, derivs[2]);
   2843 
   2844       fetch_texel(mach->Sampler, resource_unit, sampler_unit,
   2845                   &r[0], &r[1], &r[2], &r[3], &ZeroVec,
   2846                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
   2847                   &r[0], &r[1], &r[2], &r[3]);
   2848       break;
   2849 
   2850    default:
   2851       assert(0);
   2852    }
   2853 
   2854    swizzles[0] = inst->Src[1].Register.SwizzleX;
   2855    swizzles[1] = inst->Src[1].Register.SwizzleY;
   2856    swizzles[2] = inst->Src[1].Register.SwizzleZ;
   2857    swizzles[3] = inst->Src[1].Register.SwizzleW;
   2858 
   2859    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   2860       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   2861          store_dest(mach, &r[swizzles[chan]],
   2862                     &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
   2863       }
   2864    }
   2865 }
   2866 
   2867 
   2868 /**
   2869  * Evaluate a constant-valued coefficient at the position of the
   2870  * current quad.
   2871  */
   2872 static void
   2873 eval_constant_coef(
   2874    struct tgsi_exec_machine *mach,
   2875    unsigned attrib,
   2876    unsigned chan )
   2877 {
   2878    unsigned i;
   2879 
   2880    for( i = 0; i < TGSI_QUAD_SIZE; i++ ) {
   2881       mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
   2882    }
   2883 }
   2884 
   2885 /**
   2886  * Evaluate a linear-valued coefficient at the position of the
   2887  * current quad.
   2888  */
   2889 static void
   2890 eval_linear_coef(
   2891    struct tgsi_exec_machine *mach,
   2892    unsigned attrib,
   2893    unsigned chan )
   2894 {
   2895    const float x = mach->QuadPos.xyzw[0].f[0];
   2896    const float y = mach->QuadPos.xyzw[1].f[0];
   2897    const float dadx = mach->InterpCoefs[attrib].dadx[chan];
   2898    const float dady = mach->InterpCoefs[attrib].dady[chan];
   2899    const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
   2900    mach->Inputs[attrib].xyzw[chan].f[0] = a0;
   2901    mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
   2902    mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
   2903    mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
   2904 }
   2905 
   2906 /**
   2907  * Evaluate a perspective-valued coefficient at the position of the
   2908  * current quad.
   2909  */
   2910 static void
   2911 eval_perspective_coef(
   2912    struct tgsi_exec_machine *mach,
   2913    unsigned attrib,
   2914    unsigned chan )
   2915 {
   2916    const float x = mach->QuadPos.xyzw[0].f[0];
   2917    const float y = mach->QuadPos.xyzw[1].f[0];
   2918    const float dadx = mach->InterpCoefs[attrib].dadx[chan];
   2919    const float dady = mach->InterpCoefs[attrib].dady[chan];
   2920    const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
   2921    const float *w = mach->QuadPos.xyzw[3].f;
   2922    /* divide by W here */
   2923    mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
   2924    mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
   2925    mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
   2926    mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
   2927 }
   2928 
   2929 
   2930 typedef void (* eval_coef_func)(
   2931    struct tgsi_exec_machine *mach,
   2932    unsigned attrib,
   2933    unsigned chan );
   2934 
   2935 static void
   2936 exec_declaration(struct tgsi_exec_machine *mach,
   2937                  const struct tgsi_full_declaration *decl)
   2938 {
   2939    if (decl->Declaration.File == TGSI_FILE_SAMPLER_VIEW) {
   2940       mach->SamplerViews[decl->Range.First] = decl->SamplerView;
   2941       return;
   2942    }
   2943 
   2944    if (mach->ShaderType == PIPE_SHADER_FRAGMENT) {
   2945       if (decl->Declaration.File == TGSI_FILE_INPUT) {
   2946          uint first, last, mask;
   2947 
   2948          first = decl->Range.First;
   2949          last = decl->Range.Last;
   2950          mask = decl->Declaration.UsageMask;
   2951 
   2952          /* XXX we could remove this special-case code since
   2953           * mach->InterpCoefs[first].a0 should already have the
   2954           * front/back-face value.  But we should first update the
   2955           * ureg code to emit the right UsageMask value (WRITEMASK_X).
   2956           * Then, we could remove the tgsi_exec_machine::Face field.
   2957           */
   2958          /* XXX make FACE a system value */
   2959          if (decl->Semantic.Name == TGSI_SEMANTIC_FACE) {
   2960             uint i;
   2961 
   2962             assert(decl->Semantic.Index == 0);
   2963             assert(first == last);
   2964 
   2965             for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   2966                mach->Inputs[first].xyzw[0].f[i] = mach->Face;
   2967             }
   2968          } else {
   2969             eval_coef_func eval;
   2970             uint i, j;
   2971 
   2972             switch (decl->Interp.Interpolate) {
   2973             case TGSI_INTERPOLATE_CONSTANT:
   2974                eval = eval_constant_coef;
   2975                break;
   2976 
   2977             case TGSI_INTERPOLATE_LINEAR:
   2978                eval = eval_linear_coef;
   2979                break;
   2980 
   2981             case TGSI_INTERPOLATE_PERSPECTIVE:
   2982                eval = eval_perspective_coef;
   2983                break;
   2984 
   2985             case TGSI_INTERPOLATE_COLOR:
   2986                eval = mach->flatshade_color ? eval_constant_coef : eval_perspective_coef;
   2987                break;
   2988 
   2989             default:
   2990                assert(0);
   2991                return;
   2992             }
   2993 
   2994             for (j = 0; j < TGSI_NUM_CHANNELS; j++) {
   2995                if (mask & (1 << j)) {
   2996                   for (i = first; i <= last; i++) {
   2997                      eval(mach, i, j);
   2998                   }
   2999                }
   3000             }
   3001          }
   3002 
   3003          if (DEBUG_EXECUTION) {
   3004             uint i, j;
   3005             for (i = first; i <= last; ++i) {
   3006                debug_printf("IN[%2u] = ", i);
   3007                for (j = 0; j < TGSI_NUM_CHANNELS; j++) {
   3008                   if (j > 0) {
   3009                      debug_printf("         ");
   3010                   }
   3011                   debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
   3012                                mach->Inputs[i].xyzw[0].f[j], mach->Inputs[i].xyzw[0].u[j],
   3013                                mach->Inputs[i].xyzw[1].f[j], mach->Inputs[i].xyzw[1].u[j],
   3014                                mach->Inputs[i].xyzw[2].f[j], mach->Inputs[i].xyzw[2].u[j],
   3015                                mach->Inputs[i].xyzw[3].f[j], mach->Inputs[i].xyzw[3].u[j]);
   3016                }
   3017             }
   3018          }
   3019       }
   3020    }
   3021 
   3022 }
   3023 
   3024 typedef void (* micro_unary_op)(union tgsi_exec_channel *dst,
   3025                                 const union tgsi_exec_channel *src);
   3026 
   3027 static void
   3028 exec_scalar_unary(struct tgsi_exec_machine *mach,
   3029                   const struct tgsi_full_instruction *inst,
   3030                   micro_unary_op op,
   3031                   enum tgsi_exec_datatype dst_datatype,
   3032                   enum tgsi_exec_datatype src_datatype)
   3033 {
   3034    unsigned int chan;
   3035    union tgsi_exec_channel src;
   3036    union tgsi_exec_channel dst;
   3037 
   3038    fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, src_datatype);
   3039    op(&dst, &src);
   3040    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   3041       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   3042          store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
   3043       }
   3044    }
   3045 }
   3046 
   3047 static void
   3048 exec_vector_unary(struct tgsi_exec_machine *mach,
   3049                   const struct tgsi_full_instruction *inst,
   3050                   micro_unary_op op,
   3051                   enum tgsi_exec_datatype dst_datatype,
   3052                   enum tgsi_exec_datatype src_datatype)
   3053 {
   3054    unsigned int chan;
   3055    struct tgsi_exec_vector dst;
   3056 
   3057    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   3058       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   3059          union tgsi_exec_channel src;
   3060 
   3061          fetch_source(mach, &src, &inst->Src[0], chan, src_datatype);
   3062          op(&dst.xyzw[chan], &src);
   3063       }
   3064    }
   3065    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   3066       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   3067          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
   3068       }
   3069    }
   3070 }
   3071 
   3072 typedef void (* micro_binary_op)(union tgsi_exec_channel *dst,
   3073                                  const union tgsi_exec_channel *src0,
   3074                                  const union tgsi_exec_channel *src1);
   3075 
   3076 static void
   3077 exec_scalar_binary(struct tgsi_exec_machine *mach,
   3078                    const struct tgsi_full_instruction *inst,
   3079                    micro_binary_op op,
   3080                    enum tgsi_exec_datatype dst_datatype,
   3081                    enum tgsi_exec_datatype src_datatype)
   3082 {
   3083    unsigned int chan;
   3084    union tgsi_exec_channel src[2];
   3085    union tgsi_exec_channel dst;
   3086 
   3087    fetch_source(mach, &src[0], &inst->Src[0], TGSI_CHAN_X, src_datatype);
   3088    fetch_source(mach, &src[1], &inst->Src[1], TGSI_CHAN_X, src_datatype);
   3089    op(&dst, &src[0], &src[1]);
   3090    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   3091       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   3092          store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
   3093       }
   3094    }
   3095 }
   3096 
   3097 static void
   3098 exec_vector_binary(struct tgsi_exec_machine *mach,
   3099                    const struct tgsi_full_instruction *inst,
   3100                    micro_binary_op op,
   3101                    enum tgsi_exec_datatype dst_datatype,
   3102                    enum tgsi_exec_datatype src_datatype)
   3103 {
   3104    unsigned int chan;
   3105    struct tgsi_exec_vector dst;
   3106 
   3107    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   3108       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   3109          union tgsi_exec_channel src[2];
   3110 
   3111          fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
   3112          fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
   3113          op(&dst.xyzw[chan], &src[0], &src[1]);
   3114       }
   3115    }
   3116    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   3117       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   3118          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
   3119       }
   3120    }
   3121 }
   3122 
   3123 typedef void (* micro_trinary_op)(union tgsi_exec_channel *dst,
   3124                                   const union tgsi_exec_channel *src0,
   3125                                   const union tgsi_exec_channel *src1,
   3126                                   const union tgsi_exec_channel *src2);
   3127 
   3128 static void
   3129 exec_vector_trinary(struct tgsi_exec_machine *mach,
   3130                     const struct tgsi_full_instruction *inst,
   3131                     micro_trinary_op op,
   3132                     enum tgsi_exec_datatype dst_datatype,
   3133                     enum tgsi_exec_datatype src_datatype)
   3134 {
   3135    unsigned int chan;
   3136    struct tgsi_exec_vector dst;
   3137 
   3138    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   3139       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   3140          union tgsi_exec_channel src[3];
   3141 
   3142          fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
   3143          fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
   3144          fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
   3145          op(&dst.xyzw[chan], &src[0], &src[1], &src[2]);
   3146       }
   3147    }
   3148    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   3149       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   3150          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
   3151       }
   3152    }
   3153 }
   3154 
   3155 typedef void (* micro_quaternary_op)(union tgsi_exec_channel *dst,
   3156                                      const union tgsi_exec_channel *src0,
   3157                                      const union tgsi_exec_channel *src1,
   3158                                      const union tgsi_exec_channel *src2,
   3159                                      const union tgsi_exec_channel *src3);
   3160 
   3161 static void
   3162 exec_vector_quaternary(struct tgsi_exec_machine *mach,
   3163                        const struct tgsi_full_instruction *inst,
   3164                        micro_quaternary_op op,
   3165                        enum tgsi_exec_datatype dst_datatype,
   3166                        enum tgsi_exec_datatype src_datatype)
   3167 {
   3168    unsigned int chan;
   3169    struct tgsi_exec_vector dst;
   3170 
   3171    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   3172       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   3173          union tgsi_exec_channel src[4];
   3174 
   3175          fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
   3176          fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
   3177          fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
   3178          fetch_source(mach, &src[3], &inst->Src[3], chan, src_datatype);
   3179          op(&dst.xyzw[chan], &src[0], &src[1], &src[2], &src[3]);
   3180       }
   3181    }
   3182    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   3183       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   3184          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
   3185       }
   3186    }
   3187 }
   3188 
   3189 static void
   3190 exec_dp3(struct tgsi_exec_machine *mach,
   3191          const struct tgsi_full_instruction *inst)
   3192 {
   3193    unsigned int chan;
   3194    union tgsi_exec_channel arg[3];
   3195 
   3196    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   3197    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   3198    micro_mul(&arg[2], &arg[0], &arg[1]);
   3199 
   3200    for (chan = TGSI_CHAN_Y; chan <= TGSI_CHAN_Z; chan++) {
   3201       fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
   3202       fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
   3203       micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
   3204    }
   3205 
   3206    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   3207       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   3208          store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
   3209       }
   3210    }
   3211 }
   3212 
   3213 static void
   3214 exec_dp4(struct tgsi_exec_machine *mach,
   3215          const struct tgsi_full_instruction *inst)
   3216 {
   3217    unsigned int chan;
   3218    union tgsi_exec_channel arg[3];
   3219 
   3220    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   3221    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   3222    micro_mul(&arg[2], &arg[0], &arg[1]);
   3223 
   3224    for (chan = TGSI_CHAN_Y; chan <= TGSI_CHAN_W; chan++) {
   3225       fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
   3226       fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
   3227       micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
   3228    }
   3229 
   3230    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   3231       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   3232          store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
   3233       }
   3234    }
   3235 }
   3236 
   3237 static void
   3238 exec_dp2(struct tgsi_exec_machine *mach,
   3239          const struct tgsi_full_instruction *inst)
   3240 {
   3241    unsigned int chan;
   3242    union tgsi_exec_channel arg[3];
   3243 
   3244    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   3245    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   3246    micro_mul(&arg[2], &arg[0], &arg[1]);
   3247 
   3248    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   3249    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   3250    micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
   3251 
   3252    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   3253       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   3254          store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
   3255       }
   3256    }
   3257 }
   3258 
   3259 static void
   3260 exec_pk2h(struct tgsi_exec_machine *mach,
   3261           const struct tgsi_full_instruction *inst)
   3262 {
   3263    unsigned chan;
   3264    union tgsi_exec_channel arg[2], dst;
   3265 
   3266    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   3267    fetch_source(mach, &arg[1], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   3268    for (chan = 0; chan < TGSI_QUAD_SIZE; chan++) {
   3269       dst.u[chan] = util_float_to_half(arg[0].f[chan]) |
   3270          (util_float_to_half(arg[1].f[chan]) << 16);
   3271    }
   3272    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   3273       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   3274          store_dest(mach, &dst, &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_UINT);
   3275       }
   3276    }
   3277 }
   3278 
   3279 static void
   3280 exec_up2h(struct tgsi_exec_machine *mach,
   3281           const struct tgsi_full_instruction *inst)
   3282 {
   3283    unsigned chan;
   3284    union tgsi_exec_channel arg, dst[2];
   3285 
   3286    fetch_source(mach, &arg, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_UINT);
   3287    for (chan = 0; chan < TGSI_QUAD_SIZE; chan++) {
   3288       dst[0].f[chan] = util_half_to_float(arg.u[chan] & 0xffff);
   3289       dst[1].f[chan] = util_half_to_float(arg.u[chan] >> 16);
   3290    }
   3291    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   3292       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   3293          store_dest(mach, &dst[chan & 1], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
   3294       }
   3295    }
   3296 }
   3297 
   3298 static void
   3299 micro_ucmp(union tgsi_exec_channel *dst,
   3300            const union tgsi_exec_channel *src0,
   3301            const union tgsi_exec_channel *src1,
   3302            const union tgsi_exec_channel *src2)
   3303 {
   3304    dst->f[0] = src0->u[0] ? src1->f[0] : src2->f[0];
   3305    dst->f[1] = src0->u[1] ? src1->f[1] : src2->f[1];
   3306    dst->f[2] = src0->u[2] ? src1->f[2] : src2->f[2];
   3307    dst->f[3] = src0->u[3] ? src1->f[3] : src2->f[3];
   3308 }
   3309 
   3310 static void
   3311 exec_ucmp(struct tgsi_exec_machine *mach,
   3312           const struct tgsi_full_instruction *inst)
   3313 {
   3314    unsigned int chan;
   3315    struct tgsi_exec_vector dst;
   3316 
   3317    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   3318       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   3319          union tgsi_exec_channel src[3];
   3320 
   3321          fetch_source(mach, &src[0], &inst->Src[0], chan,
   3322                       TGSI_EXEC_DATA_UINT);
   3323          fetch_source(mach, &src[1], &inst->Src[1], chan,
   3324                       TGSI_EXEC_DATA_FLOAT);
   3325          fetch_source(mach, &src[2], &inst->Src[2], chan,
   3326                       TGSI_EXEC_DATA_FLOAT);
   3327          micro_ucmp(&dst.xyzw[chan], &src[0], &src[1], &src[2]);
   3328       }
   3329    }
   3330    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   3331       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   3332          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan,
   3333                     TGSI_EXEC_DATA_FLOAT);
   3334       }
   3335    }
   3336 }
   3337 
   3338 static void
   3339 exec_dst(struct tgsi_exec_machine *mach,
   3340          const struct tgsi_full_instruction *inst)
   3341 {
   3342    union tgsi_exec_channel r[2];
   3343    union tgsi_exec_channel d[4];
   3344 
   3345    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
   3346       fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   3347       fetch_source(mach, &r[1], &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   3348       micro_mul(&d[TGSI_CHAN_Y], &r[0], &r[1]);
   3349    }
   3350    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
   3351       fetch_source(mach, &d[TGSI_CHAN_Z], &inst->Src[0], TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
   3352    }
   3353    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
   3354       fetch_source(mach, &d[TGSI_CHAN_W], &inst->Src[1], TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
   3355    }
   3356 
   3357    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
   3358       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   3359    }
   3360    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
   3361       store_dest(mach, &d[TGSI_CHAN_Y], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   3362    }
   3363    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
   3364       store_dest(mach, &d[TGSI_CHAN_Z], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
   3365    }
   3366    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
   3367       store_dest(mach, &d[TGSI_CHAN_W], &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
   3368    }
   3369 }
   3370 
   3371 static void
   3372 exec_log(struct tgsi_exec_machine *mach,
   3373          const struct tgsi_full_instruction *inst)
   3374 {
   3375    union tgsi_exec_channel r[3];
   3376 
   3377    fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   3378    micro_abs(&r[2], &r[0]);  /* r2 = abs(r0) */
   3379    micro_lg2(&r[1], &r[2]);  /* r1 = lg2(r2) */
   3380    micro_flr(&r[0], &r[1]);  /* r0 = floor(r1) */
   3381    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
   3382       store_dest(mach, &r[0], &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   3383    }
   3384    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
   3385       micro_exp2(&r[0], &r[0]);       /* r0 = 2 ^ r0 */
   3386       micro_div(&r[0], &r[2], &r[0]); /* r0 = r2 / r0 */
   3387       store_dest(mach, &r[0], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   3388    }
   3389    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
   3390       store_dest(mach, &r[1], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
   3391    }
   3392    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
   3393       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
   3394    }
   3395 }
   3396 
   3397 static void
   3398 exec_exp(struct tgsi_exec_machine *mach,
   3399          const struct tgsi_full_instruction *inst)
   3400 {
   3401    union tgsi_exec_channel r[3];
   3402 
   3403    fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   3404    micro_flr(&r[1], &r[0]);  /* r1 = floor(r0) */
   3405    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
   3406       micro_exp2(&r[2], &r[1]);       /* r2 = 2 ^ r1 */
   3407       store_dest(mach, &r[2], &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   3408    }
   3409    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
   3410       micro_sub(&r[2], &r[0], &r[1]); /* r2 = r0 - r1 */
   3411       store_dest(mach, &r[2], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   3412    }
   3413    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
   3414       micro_exp2(&r[2], &r[0]);       /* r2 = 2 ^ r0 */
   3415       store_dest(mach, &r[2], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
   3416    }
   3417    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
   3418       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
   3419    }
   3420 }
   3421 
   3422 static void
   3423 exec_lit(struct tgsi_exec_machine *mach,
   3424          const struct tgsi_full_instruction *inst)
   3425 {
   3426    union tgsi_exec_channel r[3];
   3427    union tgsi_exec_channel d[3];
   3428 
   3429    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_YZ) {
   3430       fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   3431       if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
   3432          fetch_source(mach, &r[1], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   3433          micro_max(&r[1], &r[1], &ZeroVec);
   3434 
   3435          fetch_source(mach, &r[2], &inst->Src[0], TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
   3436          micro_min(&r[2], &r[2], &P128Vec);
   3437          micro_max(&r[2], &r[2], &M128Vec);
   3438          micro_pow(&r[1], &r[1], &r[2]);
   3439          micro_lt(&d[TGSI_CHAN_Z], &ZeroVec, &r[0], &r[1], &ZeroVec);
   3440          store_dest(mach, &d[TGSI_CHAN_Z], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
   3441       }
   3442       if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
   3443          micro_max(&d[TGSI_CHAN_Y], &r[0], &ZeroVec);
   3444          store_dest(mach, &d[TGSI_CHAN_Y], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
   3445       }
   3446    }
   3447    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
   3448       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
   3449    }
   3450 
   3451    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
   3452       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
   3453    }
   3454 }
   3455 
   3456 static void
   3457 exec_break(struct tgsi_exec_machine *mach)
   3458 {
   3459    if (mach->BreakType == TGSI_EXEC_BREAK_INSIDE_LOOP) {
   3460       /* turn off loop channels for each enabled exec channel */
   3461       mach->LoopMask &= ~mach->ExecMask;
   3462       /* Todo: if mach->LoopMask == 0, jump to end of loop */
   3463       UPDATE_EXEC_MASK(mach);
   3464    } else {
   3465       assert(mach->BreakType == TGSI_EXEC_BREAK_INSIDE_SWITCH);
   3466 
   3467       mach->Switch.mask = 0x0;
   3468 
   3469       UPDATE_EXEC_MASK(mach);
   3470    }
   3471 }
   3472 
   3473 static void
   3474 exec_switch(struct tgsi_exec_machine *mach,
   3475             const struct tgsi_full_instruction *inst)
   3476 {
   3477    assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
   3478    assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
   3479 
   3480    mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
   3481    fetch_source(mach, &mach->Switch.selector, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_UINT);
   3482    mach->Switch.mask = 0x0;
   3483    mach->Switch.defaultMask = 0x0;
   3484 
   3485    mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
   3486    mach->BreakType = TGSI_EXEC_BREAK_INSIDE_SWITCH;
   3487 
   3488    UPDATE_EXEC_MASK(mach);
   3489 }
   3490 
   3491 static void
   3492 exec_case(struct tgsi_exec_machine *mach,
   3493           const struct tgsi_full_instruction *inst)
   3494 {
   3495    uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
   3496    union tgsi_exec_channel src;
   3497    uint mask = 0;
   3498 
   3499    fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_UINT);
   3500 
   3501    if (mach->Switch.selector.u[0] == src.u[0]) {
   3502       mask |= 0x1;
   3503    }
   3504    if (mach->Switch.selector.u[1] == src.u[1]) {
   3505       mask |= 0x2;
   3506    }
   3507    if (mach->Switch.selector.u[2] == src.u[2]) {
   3508       mask |= 0x4;
   3509    }
   3510    if (mach->Switch.selector.u[3] == src.u[3]) {
   3511       mask |= 0x8;
   3512    }
   3513 
   3514    mach->Switch.defaultMask |= mask;
   3515 
   3516    mach->Switch.mask |= mask & prevMask;
   3517 
   3518    UPDATE_EXEC_MASK(mach);
   3519 }
   3520 
   3521 /* FIXME: this will only work if default is last */
   3522 static void
   3523 exec_default(struct tgsi_exec_machine *mach)
   3524 {
   3525    uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
   3526 
   3527    mach->Switch.mask |= ~mach->Switch.defaultMask & prevMask;
   3528 
   3529    UPDATE_EXEC_MASK(mach);
   3530 }
   3531 
   3532 static void
   3533 exec_endswitch(struct tgsi_exec_machine *mach)
   3534 {
   3535    mach->Switch = mach->SwitchStack[--mach->SwitchStackTop];
   3536    mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
   3537 
   3538    UPDATE_EXEC_MASK(mach);
   3539 }
   3540 
/**
 * Micro-op working on tgsi_double_channel values.  exec_double_binary()
 * and exec_double_trinary() in this file pass an array of two or three
 * source channels through the single src pointer.
 */
typedef void (* micro_dop)(union tgsi_double_channel *dst,
                           const union tgsi_double_channel *src);

/**
 * Micro-op taking a tgsi_double_channel first source and a plain
 * tgsi_exec_channel second source, producing a tgsi_double_channel
 * (the callback type accepted by exec_arg0_64_arg1_32()).
 */
typedef void (* micro_dop_sop)(union tgsi_double_channel *dst,
                               const union tgsi_double_channel *src0,
                               union tgsi_exec_channel *src1);

/**
 * Micro-op producing a tgsi_double_channel from a tgsi_exec_channel source.
 */
typedef void (* micro_dop_s)(union tgsi_double_channel *dst,
                             const union tgsi_exec_channel *src);

/**
 * Micro-op producing a tgsi_exec_channel from a tgsi_double_channel source.
 */
typedef void (* micro_sop_d)(union tgsi_exec_channel *dst,
                             const union tgsi_double_channel *src);
   3553 
   3554 static void
   3555 fetch_double_channel(struct tgsi_exec_machine *mach,
   3556                      union tgsi_double_channel *chan,
   3557                      const struct tgsi_full_src_register *reg,
   3558                      uint chan_0,
   3559                      uint chan_1)
   3560 {
   3561    union tgsi_exec_channel src[2];
   3562    uint i;
   3563 
   3564    fetch_source_d(mach, &src[0], reg, chan_0, TGSI_EXEC_DATA_UINT);
   3565    fetch_source_d(mach, &src[1], reg, chan_1, TGSI_EXEC_DATA_UINT);
   3566 
   3567    for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   3568       chan->u[i][0] = src[0].u[i];
   3569       chan->u[i][1] = src[1].u[i];
   3570    }
   3571    if (reg->Register.Absolute) {
   3572       micro_dabs(chan, chan);
   3573    }
   3574    if (reg->Register.Negate) {
   3575       micro_dneg(chan, chan);
   3576    }
   3577 }
   3578 
   3579 static void
   3580 store_double_channel(struct tgsi_exec_machine *mach,
   3581                      const union tgsi_double_channel *chan,
   3582                      const struct tgsi_full_dst_register *reg,
   3583                      const struct tgsi_full_instruction *inst,
   3584                      uint chan_0,
   3585                      uint chan_1)
   3586 {
   3587    union tgsi_exec_channel dst[2];
   3588    uint i;
   3589    union tgsi_double_channel temp;
   3590    const uint execmask = mach->ExecMask;
   3591 
   3592    if (!inst->Instruction.Saturate) {
   3593       for (i = 0; i < TGSI_QUAD_SIZE; i++)
   3594          if (execmask & (1 << i)) {
   3595             dst[0].u[i] = chan->u[i][0];
   3596             dst[1].u[i] = chan->u[i][1];
   3597          }
   3598    }
   3599    else {
   3600       for (i = 0; i < TGSI_QUAD_SIZE; i++)
   3601          if (execmask & (1 << i)) {
   3602             if (chan->d[i] < 0.0)
   3603                temp.d[i] = 0.0;
   3604             else if (chan->d[i] > 1.0)
   3605                temp.d[i] = 1.0;
   3606             else
   3607                temp.d[i] = chan->d[i];
   3608 
   3609             dst[0].u[i] = temp.u[i][0];
   3610             dst[1].u[i] = temp.u[i][1];
   3611          }
   3612    }
   3613 
   3614    store_dest_double(mach, &dst[0], reg, inst, chan_0, TGSI_EXEC_DATA_UINT);
   3615    if (chan_1 != -1)
   3616       store_dest_double(mach, &dst[1], reg, inst, chan_1, TGSI_EXEC_DATA_UINT);
   3617 }
   3618 
   3619 static void
   3620 exec_double_unary(struct tgsi_exec_machine *mach,
   3621                   const struct tgsi_full_instruction *inst,
   3622                   micro_dop op)
   3623 {
   3624    union tgsi_double_channel src;
   3625    union tgsi_double_channel dst;
   3626 
   3627    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) == TGSI_WRITEMASK_XY) {
   3628       fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
   3629       op(&dst, &src);
   3630       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
   3631    }
   3632    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_ZW) == TGSI_WRITEMASK_ZW) {
   3633       fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
   3634       op(&dst, &src);
   3635       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
   3636    }
   3637 }
   3638 
   3639 static void
   3640 exec_double_binary(struct tgsi_exec_machine *mach,
   3641                    const struct tgsi_full_instruction *inst,
   3642                    micro_dop op,
   3643                    enum tgsi_exec_datatype dst_datatype)
   3644 {
   3645    union tgsi_double_channel src[2];
   3646    union tgsi_double_channel dst;
   3647    int first_dest_chan, second_dest_chan;
   3648    int wmask;
   3649 
   3650    wmask = inst->Dst[0].Register.WriteMask;
   3651    /* these are & because of the way DSLT etc store their destinations */
   3652    if (wmask & TGSI_WRITEMASK_XY) {
   3653       first_dest_chan = TGSI_CHAN_X;
   3654       second_dest_chan = TGSI_CHAN_Y;
   3655       if (dst_datatype == TGSI_EXEC_DATA_UINT) {
   3656          first_dest_chan = (wmask & TGSI_WRITEMASK_X) ? TGSI_CHAN_X : TGSI_CHAN_Y;
   3657          second_dest_chan = -1;
   3658       }
   3659 
   3660       fetch_double_channel(mach, &src[0], &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
   3661       fetch_double_channel(mach, &src[1], &inst->Src[1], TGSI_CHAN_X, TGSI_CHAN_Y);
   3662       op(&dst, src);
   3663       store_double_channel(mach, &dst, &inst->Dst[0], inst, first_dest_chan, second_dest_chan);
   3664    }
   3665 
   3666    if (wmask & TGSI_WRITEMASK_ZW) {
   3667       first_dest_chan = TGSI_CHAN_Z;
   3668       second_dest_chan = TGSI_CHAN_W;
   3669       if (dst_datatype == TGSI_EXEC_DATA_UINT) {
   3670          first_dest_chan = (wmask & TGSI_WRITEMASK_Z) ? TGSI_CHAN_Z : TGSI_CHAN_W;
   3671          second_dest_chan = -1;
   3672       }
   3673 
   3674       fetch_double_channel(mach, &src[0], &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
   3675       fetch_double_channel(mach, &src[1], &inst->Src[1], TGSI_CHAN_Z, TGSI_CHAN_W);
   3676       op(&dst, src);
   3677       store_double_channel(mach, &dst, &inst->Dst[0], inst, first_dest_chan, second_dest_chan);
   3678    }
   3679 }
   3680 
   3681 static void
   3682 exec_double_trinary(struct tgsi_exec_machine *mach,
   3683                     const struct tgsi_full_instruction *inst,
   3684                     micro_dop op)
   3685 {
   3686    union tgsi_double_channel src[3];
   3687    union tgsi_double_channel dst;
   3688 
   3689    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) == TGSI_WRITEMASK_XY) {
   3690       fetch_double_channel(mach, &src[0], &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
   3691       fetch_double_channel(mach, &src[1], &inst->Src[1], TGSI_CHAN_X, TGSI_CHAN_Y);
   3692       fetch_double_channel(mach, &src[2], &inst->Src[2], TGSI_CHAN_X, TGSI_CHAN_Y);
   3693       op(&dst, src);
   3694       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
   3695    }
   3696    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_ZW) == TGSI_WRITEMASK_ZW) {
   3697       fetch_double_channel(mach, &src[0], &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
   3698       fetch_double_channel(mach, &src[1], &inst->Src[1], TGSI_CHAN_Z, TGSI_CHAN_W);
   3699       fetch_double_channel(mach, &src[2], &inst->Src[2], TGSI_CHAN_Z, TGSI_CHAN_W);
   3700       op(&dst, src);
   3701       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
   3702    }
   3703 }
   3704 
   3705 static void
   3706 exec_dldexp(struct tgsi_exec_machine *mach,
   3707             const struct tgsi_full_instruction *inst)
   3708 {
   3709    union tgsi_double_channel src0;
   3710    union tgsi_exec_channel src1;
   3711    union tgsi_double_channel dst;
   3712    int wmask;
   3713 
   3714    wmask = inst->Dst[0].Register.WriteMask;
   3715    if (wmask & TGSI_WRITEMASK_XY) {
   3716       fetch_double_channel(mach, &src0, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
   3717       fetch_source(mach, &src1, &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_INT);
   3718       micro_dldexp(&dst, &src0, &src1);
   3719       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
   3720    }
   3721 
   3722    if (wmask & TGSI_WRITEMASK_ZW) {
   3723       fetch_double_channel(mach, &src0, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
   3724       fetch_source(mach, &src1, &inst->Src[1], TGSI_CHAN_Z, TGSI_EXEC_DATA_INT);
   3725       micro_dldexp(&dst, &src0, &src1);
   3726       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
   3727    }
   3728 }
   3729 
   3730 static void
   3731 exec_dfracexp(struct tgsi_exec_machine *mach,
   3732               const struct tgsi_full_instruction *inst)
   3733 {
   3734    union tgsi_double_channel src;
   3735    union tgsi_double_channel dst;
   3736    union tgsi_exec_channel dst_exp;
   3737 
   3738    fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
   3739    micro_dfracexp(&dst, &dst_exp, &src);
   3740    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) == TGSI_WRITEMASK_XY)
   3741       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
   3742    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_ZW) == TGSI_WRITEMASK_ZW)
   3743       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
   3744    for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   3745       if (inst->Dst[1].Register.WriteMask & (1 << chan))
   3746          store_dest(mach, &dst_exp, &inst->Dst[1], inst, chan, TGSI_EXEC_DATA_INT);
   3747    }
   3748 }
   3749 
   3750 static void
   3751 exec_arg0_64_arg1_32(struct tgsi_exec_machine *mach,
   3752             const struct tgsi_full_instruction *inst,
   3753             micro_dop_sop op)
   3754 {
   3755    union tgsi_double_channel src0;
   3756    union tgsi_exec_channel src1;
   3757    union tgsi_double_channel dst;
   3758    int wmask;
   3759 
   3760    wmask = inst->Dst[0].Register.WriteMask;
   3761    if (wmask & TGSI_WRITEMASK_XY) {
   3762       fetch_double_channel(mach, &src0, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
   3763       fetch_source(mach, &src1, &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_INT);
   3764       op(&dst, &src0, &src1);
   3765       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
   3766    }
   3767 
   3768    if (wmask & TGSI_WRITEMASK_ZW) {
   3769       fetch_double_channel(mach, &src0, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
   3770       fetch_source(mach, &src1, &inst->Src[1], TGSI_CHAN_Z, TGSI_EXEC_DATA_INT);
   3771       op(&dst, &src0, &src1);
   3772       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
   3773    }
   3774 }
   3775 
   3776 static int
   3777 get_image_coord_dim(unsigned tgsi_tex)
   3778 {
   3779    int dim;
   3780    switch (tgsi_tex) {
   3781    case TGSI_TEXTURE_BUFFER:
   3782    case TGSI_TEXTURE_1D:
   3783       dim = 1;
   3784       break;
   3785    case TGSI_TEXTURE_2D:
   3786    case TGSI_TEXTURE_RECT:
   3787    case TGSI_TEXTURE_1D_ARRAY:
   3788    case TGSI_TEXTURE_2D_MSAA:
   3789       dim = 2;
   3790       break;
   3791    case TGSI_TEXTURE_3D:
   3792    case TGSI_TEXTURE_CUBE:
   3793    case TGSI_TEXTURE_2D_ARRAY:
   3794    case TGSI_TEXTURE_2D_ARRAY_MSAA:
   3795    case TGSI_TEXTURE_CUBE_ARRAY:
   3796       dim = 3;
   3797       break;
   3798    default:
   3799       assert(!"unknown texture target");
   3800       dim = 0;
   3801       break;
   3802    }
   3803 
   3804    return dim;
   3805 }
   3806 
   3807 static int
   3808 get_image_coord_sample(unsigned tgsi_tex)
   3809 {
   3810    int sample = 0;
   3811    switch (tgsi_tex) {
   3812    case TGSI_TEXTURE_2D_MSAA:
   3813       sample = 3;
   3814       break;
   3815    case TGSI_TEXTURE_2D_ARRAY_MSAA:
   3816       sample = 4;
   3817       break;
   3818    default:
   3819       break;
   3820    }
   3821    return sample;
   3822 }
   3823 
   3824 static void
   3825 exec_load_img(struct tgsi_exec_machine *mach,
   3826               const struct tgsi_full_instruction *inst)
   3827 {
   3828    union tgsi_exec_channel r[4], sample_r;
   3829    uint unit;
   3830    int sample;
   3831    int i, j;
   3832    int dim;
   3833    uint chan;
   3834    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
   3835    struct tgsi_image_params params;
   3836    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
   3837 
   3838    unit = fetch_sampler_unit(mach, inst, 0);
   3839    dim = get_image_coord_dim(inst->Memory.Texture);
   3840    sample = get_image_coord_sample(inst->Memory.Texture);
   3841    assert(dim <= 3);
   3842 
   3843    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
   3844    params.unit = unit;
   3845    params.tgsi_tex_instr = inst->Memory.Texture;
   3846    params.format = inst->Memory.Format;
   3847 
   3848    for (i = 0; i < dim; i++) {
   3849       IFETCH(&r[i], 1, TGSI_CHAN_X + i);
   3850    }
   3851 
   3852    if (sample)
   3853       IFETCH(&sample_r, 1, TGSI_CHAN_X + sample);
   3854 
   3855    mach->Image->load(mach->Image, &params,
   3856                      r[0].i, r[1].i, r[2].i, sample_r.i,
   3857                      rgba);
   3858    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
   3859       r[0].f[j] = rgba[0][j];
   3860       r[1].f[j] = rgba[1][j];
   3861       r[2].f[j] = rgba[2][j];
   3862       r[3].f[j] = rgba[3][j];
   3863    }
   3864    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   3865       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   3866          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
   3867       }
   3868    }
   3869 }
   3870 
   3871 static void
   3872 exec_load_buf(struct tgsi_exec_machine *mach,
   3873               const struct tgsi_full_instruction *inst)
   3874 {
   3875    union tgsi_exec_channel r[4];
   3876    uint unit;
   3877    int j;
   3878    uint chan;
   3879    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
   3880    struct tgsi_buffer_params params;
   3881    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
   3882 
   3883    unit = fetch_sampler_unit(mach, inst, 0);
   3884 
   3885    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
   3886    params.unit = unit;
   3887    IFETCH(&r[0], 1, TGSI_CHAN_X);
   3888 
   3889    mach->Buffer->load(mach->Buffer, &params,
   3890                       r[0].i, rgba);
   3891    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
   3892       r[0].f[j] = rgba[0][j];
   3893       r[1].f[j] = rgba[1][j];
   3894       r[2].f[j] = rgba[2][j];
   3895       r[3].f[j] = rgba[3][j];
   3896    }
   3897    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   3898       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   3899          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
   3900       }
   3901    }
   3902 }
   3903 
   3904 static void
   3905 exec_load_mem(struct tgsi_exec_machine *mach,
   3906               const struct tgsi_full_instruction *inst)
   3907 {
   3908    union tgsi_exec_channel r[4];
   3909    uint chan;
   3910    char *ptr = mach->LocalMem;
   3911    uint32_t offset;
   3912    int j;
   3913 
   3914    IFETCH(&r[0], 1, TGSI_CHAN_X);
   3915    if (r[0].u[0] >= mach->LocalMemSize)
   3916       return;
   3917 
   3918    offset = r[0].u[0];
   3919    ptr += offset;
   3920 
   3921    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
   3922       for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   3923          if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   3924             memcpy(&r[chan].u[j], ptr + (4 * chan), 4);
   3925          }
   3926       }
   3927    }
   3928 
   3929    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   3930       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   3931          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
   3932       }
   3933    }
   3934 }
   3935 
   3936 static void
   3937 exec_load(struct tgsi_exec_machine *mach,
   3938           const struct tgsi_full_instruction *inst)
   3939 {
   3940    if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
   3941       exec_load_img(mach, inst);
   3942    else if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
   3943       exec_load_buf(mach, inst);
   3944    else if (inst->Src[0].Register.File == TGSI_FILE_MEMORY)
   3945       exec_load_mem(mach, inst);
   3946 }
   3947 
   3948 static void
   3949 exec_store_img(struct tgsi_exec_machine *mach,
   3950                const struct tgsi_full_instruction *inst)
   3951 {
   3952    union tgsi_exec_channel r[3], sample_r;
   3953    union tgsi_exec_channel value[4];
   3954    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
   3955    struct tgsi_image_params params;
   3956    int dim;
   3957    int sample;
   3958    int i, j;
   3959    uint unit;
   3960    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
   3961    unit = inst->Dst[0].Register.Index;
   3962    dim = get_image_coord_dim(inst->Memory.Texture);
   3963    sample = get_image_coord_sample(inst->Memory.Texture);
   3964    assert(dim <= 3);
   3965 
   3966    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
   3967    params.unit = unit;
   3968    params.tgsi_tex_instr = inst->Memory.Texture;
   3969    params.format = inst->Memory.Format;
   3970 
   3971    for (i = 0; i < dim; i++) {
   3972       IFETCH(&r[i], 0, TGSI_CHAN_X + i);
   3973    }
   3974 
   3975    for (i = 0; i < 4; i++) {
   3976       FETCH(&value[i], 1, TGSI_CHAN_X + i);
   3977    }
   3978    if (sample)
   3979       IFETCH(&sample_r, 0, TGSI_CHAN_X + sample);
   3980 
   3981    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
   3982       rgba[0][j] = value[0].f[j];
   3983       rgba[1][j] = value[1].f[j];
   3984       rgba[2][j] = value[2].f[j];
   3985       rgba[3][j] = value[3].f[j];
   3986    }
   3987 
   3988    mach->Image->store(mach->Image, &params,
   3989                       r[0].i, r[1].i, r[2].i, sample_r.i,
   3990                       rgba);
   3991 }
   3992 
   3993 static void
   3994 exec_store_buf(struct tgsi_exec_machine *mach,
   3995                const struct tgsi_full_instruction *inst)
   3996 {
   3997    union tgsi_exec_channel r[3];
   3998    union tgsi_exec_channel value[4];
   3999    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
   4000    struct tgsi_buffer_params params;
   4001    int i, j;
   4002    uint unit;
   4003    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
   4004 
   4005    unit = inst->Dst[0].Register.Index;
   4006 
   4007    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
   4008    params.unit = unit;
   4009    params.writemask = inst->Dst[0].Register.WriteMask;
   4010 
   4011    IFETCH(&r[0], 0, TGSI_CHAN_X);
   4012    for (i = 0; i < 4; i++) {
   4013       FETCH(&value[i], 1, TGSI_CHAN_X + i);
   4014    }
   4015 
   4016    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
   4017       rgba[0][j] = value[0].f[j];
   4018       rgba[1][j] = value[1].f[j];
   4019       rgba[2][j] = value[2].f[j];
   4020       rgba[3][j] = value[3].f[j];
   4021    }
   4022 
   4023    mach->Buffer->store(mach->Buffer, &params,
   4024                       r[0].i,
   4025                       rgba);
   4026 }
   4027 
   4028 static void
   4029 exec_store_mem(struct tgsi_exec_machine *mach,
   4030                const struct tgsi_full_instruction *inst)
   4031 {
   4032    union tgsi_exec_channel r[3];
   4033    union tgsi_exec_channel value[4];
   4034    uint i, chan;
   4035    char *ptr = mach->LocalMem;
   4036    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
   4037    int execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
   4038 
   4039    IFETCH(&r[0], 0, TGSI_CHAN_X);
   4040 
   4041    for (i = 0; i < 4; i++) {
   4042       FETCH(&value[i], 1, TGSI_CHAN_X + i);
   4043    }
   4044 
   4045    if (r[0].u[0] >= mach->LocalMemSize)
   4046       return;
   4047    ptr += r[0].u[0];
   4048 
   4049    for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   4050       if (execmask & (1 << i)) {
   4051          for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   4052             if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   4053                memcpy(ptr + (chan * 4), &value[chan].u[0], 4);
   4054             }
   4055          }
   4056       }
   4057    }
   4058 }
   4059 
   4060 static void
   4061 exec_store(struct tgsi_exec_machine *mach,
   4062            const struct tgsi_full_instruction *inst)
   4063 {
   4064    if (inst->Dst[0].Register.File == TGSI_FILE_IMAGE)
   4065       exec_store_img(mach, inst);
   4066    else if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER)
   4067       exec_store_buf(mach, inst);
   4068    else if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY)
   4069       exec_store_mem(mach, inst);
   4070 }
   4071 
   4072 static void
   4073 exec_atomop_img(struct tgsi_exec_machine *mach,
   4074                 const struct tgsi_full_instruction *inst)
   4075 {
   4076    union tgsi_exec_channel r[4], sample_r;
   4077    union tgsi_exec_channel value[4], value2[4];
   4078    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
   4079    float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
   4080    struct tgsi_image_params params;
   4081    int dim;
   4082    int sample;
   4083    int i, j;
   4084    uint unit, chan;
   4085    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
   4086    unit = fetch_sampler_unit(mach, inst, 0);
   4087    dim = get_image_coord_dim(inst->Memory.Texture);
   4088    sample = get_image_coord_sample(inst->Memory.Texture);
   4089    assert(dim <= 3);
   4090 
   4091    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
   4092    params.unit = unit;
   4093    params.tgsi_tex_instr = inst->Memory.Texture;
   4094    params.format = inst->Memory.Format;
   4095 
   4096    for (i = 0; i < dim; i++) {
   4097       IFETCH(&r[i], 1, TGSI_CHAN_X + i);
   4098    }
   4099 
   4100    for (i = 0; i < 4; i++) {
   4101       FETCH(&value[i], 2, TGSI_CHAN_X + i);
   4102       if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
   4103          FETCH(&value2[i], 3, TGSI_CHAN_X + i);
   4104    }
   4105    if (sample)
   4106       IFETCH(&sample_r, 1, TGSI_CHAN_X + sample);
   4107 
   4108    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
   4109       rgba[0][j] = value[0].f[j];
   4110       rgba[1][j] = value[1].f[j];
   4111       rgba[2][j] = value[2].f[j];
   4112       rgba[3][j] = value[3].f[j];
   4113    }
   4114    if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
   4115       for (j = 0; j < TGSI_QUAD_SIZE; j++) {
   4116          rgba2[0][j] = value2[0].f[j];
   4117          rgba2[1][j] = value2[1].f[j];
   4118          rgba2[2][j] = value2[2].f[j];
   4119          rgba2[3][j] = value2[3].f[j];
   4120       }
   4121    }
   4122 
   4123    mach->Image->op(mach->Image, &params, inst->Instruction.Opcode,
   4124                    r[0].i, r[1].i, r[2].i, sample_r.i,
   4125                    rgba, rgba2);
   4126 
   4127    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
   4128       r[0].f[j] = rgba[0][j];
   4129       r[1].f[j] = rgba[1][j];
   4130       r[2].f[j] = rgba[2][j];
   4131       r[3].f[j] = rgba[3][j];
   4132    }
   4133    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   4134       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   4135          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
   4136       }
   4137    }
   4138 }
   4139 
   4140 static void
   4141 exec_atomop_buf(struct tgsi_exec_machine *mach,
   4142                 const struct tgsi_full_instruction *inst)
   4143 {
   4144    union tgsi_exec_channel r[4];
   4145    union tgsi_exec_channel value[4], value2[4];
   4146    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
   4147    float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
   4148    struct tgsi_buffer_params params;
   4149    int i, j;
   4150    uint unit, chan;
   4151    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
   4152 
   4153    unit = fetch_sampler_unit(mach, inst, 0);
   4154 
   4155    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
   4156    params.unit = unit;
   4157    params.writemask = inst->Dst[0].Register.WriteMask;
   4158 
   4159    IFETCH(&r[0], 1, TGSI_CHAN_X);
   4160 
   4161    for (i = 0; i < 4; i++) {
   4162       FETCH(&value[i], 2, TGSI_CHAN_X + i);
   4163       if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
   4164          FETCH(&value2[i], 3, TGSI_CHAN_X + i);
   4165    }
   4166 
   4167    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
   4168       rgba[0][j] = value[0].f[j];
   4169       rgba[1][j] = value[1].f[j];
   4170       rgba[2][j] = value[2].f[j];
   4171       rgba[3][j] = value[3].f[j];
   4172    }
   4173    if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
   4174       for (j = 0; j < TGSI_QUAD_SIZE; j++) {
   4175          rgba2[0][j] = value2[0].f[j];
   4176          rgba2[1][j] = value2[1].f[j];
   4177          rgba2[2][j] = value2[2].f[j];
   4178          rgba2[3][j] = value2[3].f[j];
   4179       }
   4180    }
   4181 
   4182    mach->Buffer->op(mach->Buffer, &params, inst->Instruction.Opcode,
   4183                    r[0].i,
   4184                    rgba, rgba2);
   4185 
   4186    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
   4187       r[0].f[j] = rgba[0][j];
   4188       r[1].f[j] = rgba[1][j];
   4189       r[2].f[j] = rgba[2][j];
   4190       r[3].f[j] = rgba[3][j];
   4191    }
   4192    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   4193       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   4194          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
   4195       }
   4196    }
   4197 }
   4198 
   4199 static void
   4200 exec_atomop_mem(struct tgsi_exec_machine *mach,
   4201                 const struct tgsi_full_instruction *inst)
   4202 {
   4203    union tgsi_exec_channel r[4];
   4204    union tgsi_exec_channel value[4], value2[4];
   4205    char *ptr = mach->LocalMem;
   4206    uint32_t val;
   4207    uint chan, i;
   4208    uint32_t offset;
   4209    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
   4210    int execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
   4211    IFETCH(&r[0], 1, TGSI_CHAN_X);
   4212 
   4213    if (r[0].u[0] >= mach->LocalMemSize)
   4214       return;
   4215 
   4216    offset = r[0].u[0];
   4217    ptr += offset;
   4218    for (i = 0; i < 4; i++) {
   4219       FETCH(&value[i], 2, TGSI_CHAN_X + i);
   4220       if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
   4221          FETCH(&value2[i], 3, TGSI_CHAN_X + i);
   4222    }
   4223 
   4224    memcpy(&r[0].u[0], ptr, 4);
   4225    val = r[0].u[0];
   4226    switch (inst->Instruction.Opcode) {
   4227    case TGSI_OPCODE_ATOMUADD:
   4228       val += value[0].u[0];
   4229       break;
   4230    case TGSI_OPCODE_ATOMXOR:
   4231       val ^= value[0].u[0];
   4232       break;
   4233    case TGSI_OPCODE_ATOMOR:
   4234       val |= value[0].u[0];
   4235       break;
   4236    case TGSI_OPCODE_ATOMAND:
   4237       val &= value[0].u[0];
   4238       break;
   4239    case TGSI_OPCODE_ATOMUMIN:
   4240       val = MIN2(val, value[0].u[0]);
   4241       break;
   4242    case TGSI_OPCODE_ATOMUMAX:
   4243       val = MAX2(val, value[0].u[0]);
   4244       break;
   4245    case TGSI_OPCODE_ATOMIMIN:
   4246       val = MIN2(r[0].i[0], value[0].i[0]);
   4247       break;
   4248    case TGSI_OPCODE_ATOMIMAX:
   4249       val = MAX2(r[0].i[0], value[0].i[0]);
   4250       break;
   4251    case TGSI_OPCODE_ATOMXCHG:
   4252       val = value[0].i[0];
   4253       break;
   4254    case TGSI_OPCODE_ATOMCAS:
   4255       if (val == value[0].u[0])
   4256          val = value2[0].u[0];
   4257       break;
   4258    default:
   4259       break;
   4260    }
   4261    for (i = 0; i < TGSI_QUAD_SIZE; i++)
   4262       if (execmask & (1 << i))
   4263          memcpy(ptr, &val, 4);
   4264 
   4265    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   4266       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   4267          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
   4268       }
   4269    }
   4270 }
   4271 
   4272 static void
   4273 exec_atomop(struct tgsi_exec_machine *mach,
   4274             const struct tgsi_full_instruction *inst)
   4275 {
   4276    if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
   4277       exec_atomop_img(mach, inst);
   4278    else if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
   4279       exec_atomop_buf(mach, inst);
   4280    else if (inst->Src[0].Register.File == TGSI_FILE_MEMORY)
   4281       exec_atomop_mem(mach, inst);
   4282 }
   4283 
   4284 static void
   4285 exec_resq_img(struct tgsi_exec_machine *mach,
   4286               const struct tgsi_full_instruction *inst)
   4287 {
   4288    int result[4];
   4289    union tgsi_exec_channel r[4];
   4290    uint unit;
   4291    int i, chan, j;
   4292    struct tgsi_image_params params;
   4293    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
   4294 
   4295    unit = fetch_sampler_unit(mach, inst, 0);
   4296 
   4297    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
   4298    params.unit = unit;
   4299    params.tgsi_tex_instr = inst->Memory.Texture;
   4300    params.format = inst->Memory.Format;
   4301 
   4302    mach->Image->get_dims(mach->Image, &params, result);
   4303 
   4304    for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   4305       for (j = 0; j < 4; j++) {
   4306          r[j].i[i] = result[j];
   4307       }
   4308    }
   4309 
   4310    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   4311       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   4312          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan,
   4313                     TGSI_EXEC_DATA_INT);
   4314       }
   4315    }
   4316 }
   4317 
   4318 static void
   4319 exec_resq_buf(struct tgsi_exec_machine *mach,
   4320               const struct tgsi_full_instruction *inst)
   4321 {
   4322    int result;
   4323    union tgsi_exec_channel r[4];
   4324    uint unit;
   4325    int i, chan;
   4326    struct tgsi_buffer_params params;
   4327    int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
   4328 
   4329    unit = fetch_sampler_unit(mach, inst, 0);
   4330 
   4331    params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
   4332    params.unit = unit;
   4333 
   4334    mach->Buffer->get_dims(mach->Buffer, &params, &result);
   4335 
   4336    for (i = 0; i < TGSI_QUAD_SIZE; i++) {
   4337       r[0].i[i] = result;
   4338    }
   4339 
   4340    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
   4341       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
   4342          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan,
   4343                     TGSI_EXEC_DATA_INT);
   4344       }
   4345    }
   4346 }
   4347 
   4348 static void
   4349 exec_resq(struct tgsi_exec_machine *mach,
   4350           const struct tgsi_full_instruction *inst)
   4351 {
   4352    if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
   4353       exec_resq_img(mach, inst);
   4354    else
   4355       exec_resq_buf(mach, inst);
   4356 }
   4357 
   4358 static void
   4359 micro_f2u64(union tgsi_double_channel *dst,
   4360             const union tgsi_exec_channel *src)
   4361 {
   4362    dst->u64[0] = (uint64_t)src->f[0];
   4363    dst->u64[1] = (uint64_t)src->f[1];
   4364    dst->u64[2] = (uint64_t)src->f[2];
   4365    dst->u64[3] = (uint64_t)src->f[3];
   4366 }
   4367 
   4368 static void
   4369 micro_f2i64(union tgsi_double_channel *dst,
   4370             const union tgsi_exec_channel *src)
   4371 {
   4372    dst->i64[0] = (int64_t)src->f[0];
   4373    dst->i64[1] = (int64_t)src->f[1];
   4374    dst->i64[2] = (int64_t)src->f[2];
   4375    dst->i64[3] = (int64_t)src->f[3];
   4376 }
   4377 
   4378 static void
   4379 micro_u2i64(union tgsi_double_channel *dst,
   4380             const union tgsi_exec_channel *src)
   4381 {
   4382    dst->u64[0] = (uint64_t)src->u[0];
   4383    dst->u64[1] = (uint64_t)src->u[1];
   4384    dst->u64[2] = (uint64_t)src->u[2];
   4385    dst->u64[3] = (uint64_t)src->u[3];
   4386 }
   4387 
   4388 static void
   4389 micro_i2i64(union tgsi_double_channel *dst,
   4390             const union tgsi_exec_channel *src)
   4391 {
   4392    dst->i64[0] = (int64_t)src->i[0];
   4393    dst->i64[1] = (int64_t)src->i[1];
   4394    dst->i64[2] = (int64_t)src->i[2];
   4395    dst->i64[3] = (int64_t)src->i[3];
   4396 }
   4397 
   4398 static void
   4399 micro_d2u64(union tgsi_double_channel *dst,
   4400            const union tgsi_double_channel *src)
   4401 {
   4402    dst->u64[0] = (uint64_t)src->d[0];
   4403    dst->u64[1] = (uint64_t)src->d[1];
   4404    dst->u64[2] = (uint64_t)src->d[2];
   4405    dst->u64[3] = (uint64_t)src->d[3];
   4406 }
   4407 
   4408 static void
   4409 micro_d2i64(union tgsi_double_channel *dst,
   4410            const union tgsi_double_channel *src)
   4411 {
   4412    dst->i64[0] = (int64_t)src->d[0];
   4413    dst->i64[1] = (int64_t)src->d[1];
   4414    dst->i64[2] = (int64_t)src->d[2];
   4415    dst->i64[3] = (int64_t)src->d[3];
   4416 }
   4417 
   4418 static void
   4419 micro_u642d(union tgsi_double_channel *dst,
   4420            const union tgsi_double_channel *src)
   4421 {
   4422    dst->d[0] = (double)src->u64[0];
   4423    dst->d[1] = (double)src->u64[1];
   4424    dst->d[2] = (double)src->u64[2];
   4425    dst->d[3] = (double)src->u64[3];
   4426 }
   4427 
   4428 static void
   4429 micro_i642d(union tgsi_double_channel *dst,
   4430            const union tgsi_double_channel *src)
   4431 {
   4432    dst->d[0] = (double)src->i64[0];
   4433    dst->d[1] = (double)src->i64[1];
   4434    dst->d[2] = (double)src->i64[2];
   4435    dst->d[3] = (double)src->i64[3];
   4436 }
   4437 
   4438 static void
   4439 micro_u642f(union tgsi_exec_channel *dst,
   4440             const union tgsi_double_channel *src)
   4441 {
   4442    dst->f[0] = (float)src->u64[0];
   4443    dst->f[1] = (float)src->u64[1];
   4444    dst->f[2] = (float)src->u64[2];
   4445    dst->f[3] = (float)src->u64[3];
   4446 }
   4447 
   4448 static void
   4449 micro_i642f(union tgsi_exec_channel *dst,
   4450             const union tgsi_double_channel *src)
   4451 {
   4452    dst->f[0] = (float)src->i64[0];
   4453    dst->f[1] = (float)src->i64[1];
   4454    dst->f[2] = (float)src->i64[2];
   4455    dst->f[3] = (float)src->i64[3];
   4456 }
   4457 
   4458 static void
   4459 exec_t_2_64(struct tgsi_exec_machine *mach,
   4460           const struct tgsi_full_instruction *inst,
   4461           micro_dop_s op,
   4462           enum tgsi_exec_datatype src_datatype)
   4463 {
   4464    union tgsi_exec_channel src;
   4465    union tgsi_double_channel dst;
   4466 
   4467    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) == TGSI_WRITEMASK_XY) {
   4468       fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, src_datatype);
   4469       op(&dst, &src);
   4470       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
   4471    }
   4472    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_ZW) == TGSI_WRITEMASK_ZW) {
   4473       fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_Y, src_datatype);
   4474       op(&dst, &src);
   4475       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
   4476    }
   4477 }
   4478 
   4479 static void
   4480 exec_64_2_t(struct tgsi_exec_machine *mach,
   4481             const struct tgsi_full_instruction *inst,
   4482             micro_sop_d op,
   4483             enum tgsi_exec_datatype dst_datatype)
   4484 {
   4485    union tgsi_double_channel src;
   4486    union tgsi_exec_channel dst;
   4487    int wm = inst->Dst[0].Register.WriteMask;
   4488    int i;
   4489    int bit;
   4490    for (i = 0; i < 2; i++) {
   4491       bit = ffs(wm);
   4492       if (bit) {
   4493          wm &= ~(1 << (bit - 1));
   4494          if (i == 0)
   4495             fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
   4496          else
   4497             fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
   4498          op(&dst, &src);
   4499          store_dest(mach, &dst, &inst->Dst[0], inst, bit - 1, dst_datatype);
   4500       }
   4501    }
   4502 }
   4503 
   4504 static void
   4505 micro_i2f(union tgsi_exec_channel *dst,
   4506           const union tgsi_exec_channel *src)
   4507 {
   4508    dst->f[0] = (float)src->i[0];
   4509    dst->f[1] = (float)src->i[1];
   4510    dst->f[2] = (float)src->i[2];
   4511    dst->f[3] = (float)src->i[3];
   4512 }
   4513 
   4514 static void
   4515 micro_not(union tgsi_exec_channel *dst,
   4516           const union tgsi_exec_channel *src)
   4517 {
   4518    dst->u[0] = ~src->u[0];
   4519    dst->u[1] = ~src->u[1];
   4520    dst->u[2] = ~src->u[2];
   4521    dst->u[3] = ~src->u[3];
   4522 }
   4523 
   4524 static void
   4525 micro_shl(union tgsi_exec_channel *dst,
   4526           const union tgsi_exec_channel *src0,
   4527           const union tgsi_exec_channel *src1)
   4528 {
   4529    unsigned masked_count;
   4530    masked_count = src1->u[0] & 0x1f;
   4531    dst->u[0] = src0->u[0] << masked_count;
   4532    masked_count = src1->u[1] & 0x1f;
   4533    dst->u[1] = src0->u[1] << masked_count;
   4534    masked_count = src1->u[2] & 0x1f;
   4535    dst->u[2] = src0->u[2] << masked_count;
   4536    masked_count = src1->u[3] & 0x1f;
   4537    dst->u[3] = src0->u[3] << masked_count;
   4538 }
   4539 
   4540 static void
   4541 micro_and(union tgsi_exec_channel *dst,
   4542           const union tgsi_exec_channel *src0,
   4543           const union tgsi_exec_channel *src1)
   4544 {
   4545    dst->u[0] = src0->u[0] & src1->u[0];
   4546    dst->u[1] = src0->u[1] & src1->u[1];
   4547    dst->u[2] = src0->u[2] & src1->u[2];
   4548    dst->u[3] = src0->u[3] & src1->u[3];
   4549 }
   4550 
   4551 static void
   4552 micro_or(union tgsi_exec_channel *dst,
   4553          const union tgsi_exec_channel *src0,
   4554          const union tgsi_exec_channel *src1)
   4555 {
   4556    dst->u[0] = src0->u[0] | src1->u[0];
   4557    dst->u[1] = src0->u[1] | src1->u[1];
   4558    dst->u[2] = src0->u[2] | src1->u[2];
   4559    dst->u[3] = src0->u[3] | src1->u[3];
   4560 }
   4561 
   4562 static void
   4563 micro_xor(union tgsi_exec_channel *dst,
   4564           const union tgsi_exec_channel *src0,
   4565           const union tgsi_exec_channel *src1)
   4566 {
   4567    dst->u[0] = src0->u[0] ^ src1->u[0];
   4568    dst->u[1] = src0->u[1] ^ src1->u[1];
   4569    dst->u[2] = src0->u[2] ^ src1->u[2];
   4570    dst->u[3] = src0->u[3] ^ src1->u[3];
   4571 }
   4572 
   4573 static void
   4574 micro_mod(union tgsi_exec_channel *dst,
   4575           const union tgsi_exec_channel *src0,
   4576           const union tgsi_exec_channel *src1)
   4577 {
   4578    dst->i[0] = src1->i[0] ? src0->i[0] % src1->i[0] : ~0;
   4579    dst->i[1] = src1->i[1] ? src0->i[1] % src1->i[1] : ~0;
   4580    dst->i[2] = src1->i[2] ? src0->i[2] % src1->i[2] : ~0;
   4581    dst->i[3] = src1->i[3] ? src0->i[3] % src1->i[3] : ~0;
   4582 }
   4583 
   4584 static void
   4585 micro_f2i(union tgsi_exec_channel *dst,
   4586           const union tgsi_exec_channel *src)
   4587 {
   4588    dst->i[0] = (int)src->f[0];
   4589    dst->i[1] = (int)src->f[1];
   4590    dst->i[2] = (int)src->f[2];
   4591    dst->i[3] = (int)src->f[3];
   4592 }
   4593 
   4594 static void
   4595 micro_fseq(union tgsi_exec_channel *dst,
   4596            const union tgsi_exec_channel *src0,
   4597            const union tgsi_exec_channel *src1)
   4598 {
   4599    dst->u[0] = src0->f[0] == src1->f[0] ? ~0 : 0;
   4600    dst->u[1] = src0->f[1] == src1->f[1] ? ~0 : 0;
   4601    dst->u[2] = src0->f[2] == src1->f[2] ? ~0 : 0;
   4602    dst->u[3] = src0->f[3] == src1->f[3] ? ~0 : 0;
   4603 }
   4604 
   4605 static void
   4606 micro_fsge(union tgsi_exec_channel *dst,
   4607            const union tgsi_exec_channel *src0,
   4608            const union tgsi_exec_channel *src1)
   4609 {
   4610    dst->u[0] = src0->f[0] >= src1->f[0] ? ~0 : 0;
   4611    dst->u[1] = src0->f[1] >= src1->f[1] ? ~0 : 0;
   4612    dst->u[2] = src0->f[2] >= src1->f[2] ? ~0 : 0;
   4613    dst->u[3] = src0->f[3] >= src1->f[3] ? ~0 : 0;
   4614 }
   4615 
   4616 static void
   4617 micro_fslt(union tgsi_exec_channel *dst,
   4618            const union tgsi_exec_channel *src0,
   4619            const union tgsi_exec_channel *src1)
   4620 {
   4621    dst->u[0] = src0->f[0] < src1->f[0] ? ~0 : 0;
   4622    dst->u[1] = src0->f[1] < src1->f[1] ? ~0 : 0;
   4623    dst->u[2] = src0->f[2] < src1->f[2] ? ~0 : 0;
   4624    dst->u[3] = src0->f[3] < src1->f[3] ? ~0 : 0;
   4625 }
   4626 
   4627 static void
   4628 micro_fsne(union tgsi_exec_channel *dst,
   4629            const union tgsi_exec_channel *src0,
   4630            const union tgsi_exec_channel *src1)
   4631 {
   4632    dst->u[0] = src0->f[0] != src1->f[0] ? ~0 : 0;
   4633    dst->u[1] = src0->f[1] != src1->f[1] ? ~0 : 0;
   4634    dst->u[2] = src0->f[2] != src1->f[2] ? ~0 : 0;
   4635    dst->u[3] = src0->f[3] != src1->f[3] ? ~0 : 0;
   4636 }
   4637 
   4638 static void
   4639 micro_idiv(union tgsi_exec_channel *dst,
   4640            const union tgsi_exec_channel *src0,
   4641            const union tgsi_exec_channel *src1)
   4642 {
   4643    dst->i[0] = src1->i[0] ? src0->i[0] / src1->i[0] : 0;
   4644    dst->i[1] = src1->i[1] ? src0->i[1] / src1->i[1] : 0;
   4645    dst->i[2] = src1->i[2] ? src0->i[2] / src1->i[2] : 0;
   4646    dst->i[3] = src1->i[3] ? src0->i[3] / src1->i[3] : 0;
   4647 }
   4648 
   4649 static void
   4650 micro_imax(union tgsi_exec_channel *dst,
   4651            const union tgsi_exec_channel *src0,
   4652            const union tgsi_exec_channel *src1)
   4653 {
   4654    dst->i[0] = src0->i[0] > src1->i[0] ? src0->i[0] : src1->i[0];
   4655    dst->i[1] = src0->i[1] > src1->i[1] ? src0->i[1] : src1->i[1];
   4656    dst->i[2] = src0->i[2] > src1->i[2] ? src0->i[2] : src1->i[2];
   4657    dst->i[3] = src0->i[3] > src1->i[3] ? src0->i[3] : src1->i[3];
   4658 }
   4659 
   4660 static void
   4661 micro_imin(union tgsi_exec_channel *dst,
   4662            const union tgsi_exec_channel *src0,
   4663            const union tgsi_exec_channel *src1)
   4664 {
   4665    dst->i[0] = src0->i[0] < src1->i[0] ? src0->i[0] : src1->i[0];
   4666    dst->i[1] = src0->i[1] < src1->i[1] ? src0->i[1] : src1->i[1];
   4667    dst->i[2] = src0->i[2] < src1->i[2] ? src0->i[2] : src1->i[2];
   4668    dst->i[3] = src0->i[3] < src1->i[3] ? src0->i[3] : src1->i[3];
   4669 }
   4670 
   4671 static void
   4672 micro_isge(union tgsi_exec_channel *dst,
   4673            const union tgsi_exec_channel *src0,
   4674            const union tgsi_exec_channel *src1)
   4675 {
   4676    dst->i[0] = src0->i[0] >= src1->i[0] ? -1 : 0;
   4677    dst->i[1] = src0->i[1] >= src1->i[1] ? -1 : 0;
   4678    dst->i[2] = src0->i[2] >= src1->i[2] ? -1 : 0;
   4679    dst->i[3] = src0->i[3] >= src1->i[3] ? -1 : 0;
   4680 }
   4681 
   4682 static void
   4683 micro_ishr(union tgsi_exec_channel *dst,
   4684            const union tgsi_exec_channel *src0,
   4685            const union tgsi_exec_channel *src1)
   4686 {
   4687    unsigned masked_count;
   4688    masked_count = src1->i[0] & 0x1f;
   4689    dst->i[0] = src0->i[0] >> masked_count;
   4690    masked_count = src1->i[1] & 0x1f;
   4691    dst->i[1] = src0->i[1] >> masked_count;
   4692    masked_count = src1->i[2] & 0x1f;
   4693    dst->i[2] = src0->i[2] >> masked_count;
   4694    masked_count = src1->i[3] & 0x1f;
   4695    dst->i[3] = src0->i[3] >> masked_count;
   4696 }
   4697 
   4698 static void
   4699 micro_islt(union tgsi_exec_channel *dst,
   4700            const union tgsi_exec_channel *src0,
   4701            const union tgsi_exec_channel *src1)
   4702 {
   4703    dst->i[0] = src0->i[0] < src1->i[0] ? -1 : 0;
   4704    dst->i[1] = src0->i[1] < src1->i[1] ? -1 : 0;
   4705    dst->i[2] = src0->i[2] < src1->i[2] ? -1 : 0;
   4706    dst->i[3] = src0->i[3] < src1->i[3] ? -1 : 0;
   4707 }
   4708 
   4709 static void
   4710 micro_f2u(union tgsi_exec_channel *dst,
   4711           const union tgsi_exec_channel *src)
   4712 {
   4713    dst->u[0] = (uint)src->f[0];
   4714    dst->u[1] = (uint)src->f[1];
   4715    dst->u[2] = (uint)src->f[2];
   4716    dst->u[3] = (uint)src->f[3];
   4717 }
   4718 
   4719 static void
   4720 micro_u2f(union tgsi_exec_channel *dst,
   4721           const union tgsi_exec_channel *src)
   4722 {
   4723    dst->f[0] = (float)src->u[0];
   4724    dst->f[1] = (float)src->u[1];
   4725    dst->f[2] = (float)src->u[2];
   4726    dst->f[3] = (float)src->u[3];
   4727 }
   4728 
   4729 static void
   4730 micro_uadd(union tgsi_exec_channel *dst,
   4731            const union tgsi_exec_channel *src0,
   4732            const union tgsi_exec_channel *src1)
   4733 {
   4734    dst->u[0] = src0->u[0] + src1->u[0];
   4735    dst->u[1] = src0->u[1] + src1->u[1];
   4736    dst->u[2] = src0->u[2] + src1->u[2];
   4737    dst->u[3] = src0->u[3] + src1->u[3];
   4738 }
   4739 
   4740 static void
   4741 micro_udiv(union tgsi_exec_channel *dst,
   4742            const union tgsi_exec_channel *src0,
   4743            const union tgsi_exec_channel *src1)
   4744 {
   4745    dst->u[0] = src1->u[0] ? src0->u[0] / src1->u[0] : ~0u;
   4746    dst->u[1] = src1->u[1] ? src0->u[1] / src1->u[1] : ~0u;
   4747    dst->u[2] = src1->u[2] ? src0->u[2] / src1->u[2] : ~0u;
   4748    dst->u[3] = src1->u[3] ? src0->u[3] / src1->u[3] : ~0u;
   4749 }
   4750 
   4751 static void
   4752 micro_umad(union tgsi_exec_channel *dst,
   4753            const union tgsi_exec_channel *src0,
   4754            const union tgsi_exec_channel *src1,
   4755            const union tgsi_exec_channel *src2)
   4756 {
   4757    dst->u[0] = src0->u[0] * src1->u[0] + src2->u[0];
   4758    dst->u[1] = src0->u[1] * src1->u[1] + src2->u[1];
   4759    dst->u[2] = src0->u[2] * src1->u[2] + src2->u[2];
   4760    dst->u[3] = src0->u[3] * src1->u[3] + src2->u[3];
   4761 }
   4762 
   4763 static void
   4764 micro_umax(union tgsi_exec_channel *dst,
   4765            const union tgsi_exec_channel *src0,
   4766            const union tgsi_exec_channel *src1)
   4767 {
   4768    dst->u[0] = src0->u[0] > src1->u[0] ? src0->u[0] : src1->u[0];
   4769    dst->u[1] = src0->u[1] > src1->u[1] ? src0->u[1] : src1->u[1];
   4770    dst->u[2] = src0->u[2] > src1->u[2] ? src0->u[2] : src1->u[2];
   4771    dst->u[3] = src0->u[3] > src1->u[3] ? src0->u[3] : src1->u[3];
   4772 }
   4773 
   4774 static void
   4775 micro_umin(union tgsi_exec_channel *dst,
   4776            const union tgsi_exec_channel *src0,
   4777            const union tgsi_exec_channel *src1)
   4778 {
   4779    dst->u[0] = src0->u[0] < src1->u[0] ? src0->u[0] : src1->u[0];
   4780    dst->u[1] = src0->u[1] < src1->u[1] ? src0->u[1] : src1->u[1];
   4781    dst->u[2] = src0->u[2] < src1->u[2] ? src0->u[2] : src1->u[2];
   4782    dst->u[3] = src0->u[3] < src1->u[3] ? src0->u[3] : src1->u[3];
   4783 }
   4784 
   4785 static void
   4786 micro_umod(union tgsi_exec_channel *dst,
   4787            const union tgsi_exec_channel *src0,
   4788            const union tgsi_exec_channel *src1)
   4789 {
   4790    dst->u[0] = src1->u[0] ? src0->u[0] % src1->u[0] : ~0u;
   4791    dst->u[1] = src1->u[1] ? src0->u[1] % src1->u[1] : ~0u;
   4792    dst->u[2] = src1->u[2] ? src0->u[2] % src1->u[2] : ~0u;
   4793    dst->u[3] = src1->u[3] ? src0->u[3] % src1->u[3] : ~0u;
   4794 }
   4795 
   4796 static void
   4797 micro_umul(union tgsi_exec_channel *dst,
   4798            const union tgsi_exec_channel *src0,
   4799            const union tgsi_exec_channel *src1)
   4800 {
   4801    dst->u[0] = src0->u[0] * src1->u[0];
   4802    dst->u[1] = src0->u[1] * src1->u[1];
   4803    dst->u[2] = src0->u[2] * src1->u[2];
   4804    dst->u[3] = src0->u[3] * src1->u[3];
   4805 }
   4806 
   4807 static void
   4808 micro_imul_hi(union tgsi_exec_channel *dst,
   4809               const union tgsi_exec_channel *src0,
   4810               const union tgsi_exec_channel *src1)
   4811 {
   4812 #define I64M(x, y) ((((int64_t)x) * ((int64_t)y)) >> 32)
   4813    dst->i[0] = I64M(src0->i[0], src1->i[0]);
   4814    dst->i[1] = I64M(src0->i[1], src1->i[1]);
   4815    dst->i[2] = I64M(src0->i[2], src1->i[2]);
   4816    dst->i[3] = I64M(src0->i[3], src1->i[3]);
   4817 #undef I64M
   4818 }
   4819 
   4820 static void
   4821 micro_umul_hi(union tgsi_exec_channel *dst,
   4822               const union tgsi_exec_channel *src0,
   4823               const union tgsi_exec_channel *src1)
   4824 {
   4825 #define U64M(x, y) ((((uint64_t)x) * ((uint64_t)y)) >> 32)
   4826    dst->u[0] = U64M(src0->u[0], src1->u[0]);
   4827    dst->u[1] = U64M(src0->u[1], src1->u[1]);
   4828    dst->u[2] = U64M(src0->u[2], src1->u[2]);
   4829    dst->u[3] = U64M(src0->u[3], src1->u[3]);
   4830 #undef U64M
   4831 }
   4832 
   4833 static void
   4834 micro_useq(union tgsi_exec_channel *dst,
   4835            const union tgsi_exec_channel *src0,
   4836            const union tgsi_exec_channel *src1)
   4837 {
   4838    dst->u[0] = src0->u[0] == src1->u[0] ? ~0 : 0;
   4839    dst->u[1] = src0->u[1] == src1->u[1] ? ~0 : 0;
   4840    dst->u[2] = src0->u[2] == src1->u[2] ? ~0 : 0;
   4841    dst->u[3] = src0->u[3] == src1->u[3] ? ~0 : 0;
   4842 }
   4843 
   4844 static void
   4845 micro_usge(union tgsi_exec_channel *dst,
   4846            const union tgsi_exec_channel *src0,
   4847            const union tgsi_exec_channel *src1)
   4848 {
   4849    dst->u[0] = src0->u[0] >= src1->u[0] ? ~0 : 0;
   4850    dst->u[1] = src0->u[1] >= src1->u[1] ? ~0 : 0;
   4851    dst->u[2] = src0->u[2] >= src1->u[2] ? ~0 : 0;
   4852    dst->u[3] = src0->u[3] >= src1->u[3] ? ~0 : 0;
   4853 }
   4854 
   4855 static void
   4856 micro_ushr(union tgsi_exec_channel *dst,
   4857            const union tgsi_exec_channel *src0,
   4858            const union tgsi_exec_channel *src1)
   4859 {
   4860    unsigned masked_count;
   4861    masked_count = src1->u[0] & 0x1f;
   4862    dst->u[0] = src0->u[0] >> masked_count;
   4863    masked_count = src1->u[1] & 0x1f;
   4864    dst->u[1] = src0->u[1] >> masked_count;
   4865    masked_count = src1->u[2] & 0x1f;
   4866    dst->u[2] = src0->u[2] >> masked_count;
   4867    masked_count = src1->u[3] & 0x1f;
   4868    dst->u[3] = src0->u[3] >> masked_count;
   4869 }
   4870 
   4871 static void
   4872 micro_uslt(union tgsi_exec_channel *dst,
   4873            const union tgsi_exec_channel *src0,
   4874            const union tgsi_exec_channel *src1)
   4875 {
   4876    dst->u[0] = src0->u[0] < src1->u[0] ? ~0 : 0;
   4877    dst->u[1] = src0->u[1] < src1->u[1] ? ~0 : 0;
   4878    dst->u[2] = src0->u[2] < src1->u[2] ? ~0 : 0;
   4879    dst->u[3] = src0->u[3] < src1->u[3] ? ~0 : 0;
   4880 }
   4881 
   4882 static void
   4883 micro_usne(union tgsi_exec_channel *dst,
   4884            const union tgsi_exec_channel *src0,
   4885            const union tgsi_exec_channel *src1)
   4886 {
   4887    dst->u[0] = src0->u[0] != src1->u[0] ? ~0 : 0;
   4888    dst->u[1] = src0->u[1] != src1->u[1] ? ~0 : 0;
   4889    dst->u[2] = src0->u[2] != src1->u[2] ? ~0 : 0;
   4890    dst->u[3] = src0->u[3] != src1->u[3] ? ~0 : 0;
   4891 }
   4892 
   4893 static void
   4894 micro_uarl(union tgsi_exec_channel *dst,
   4895            const union tgsi_exec_channel *src)
   4896 {
   4897    dst->i[0] = src->u[0];
   4898    dst->i[1] = src->u[1];
   4899    dst->i[2] = src->u[2];
   4900    dst->i[3] = src->u[3];
   4901 }
   4902 
   4903 /**
   4904  * Signed bitfield extract (i.e. sign-extend the extracted bits)
   4905  */
   4906 static void
   4907 micro_ibfe(union tgsi_exec_channel *dst,
   4908            const union tgsi_exec_channel *src0,
   4909            const union tgsi_exec_channel *src1,
   4910            const union tgsi_exec_channel *src2)
   4911 {
   4912    int i;
   4913    for (i = 0; i < 4; i++) {
   4914       int width = src2->i[i] & 0x1f;
   4915       int offset = src1->i[i] & 0x1f;
   4916       if (width == 0)
   4917          dst->i[i] = 0;
   4918       else if (width + offset < 32)
   4919          dst->i[i] = (src0->i[i] << (32 - width - offset)) >> (32 - width);
   4920       else
   4921          dst->i[i] = src0->i[i] >> offset;
   4922    }
   4923 }
   4924 
   4925 /**
   4926  * Unsigned bitfield extract
   4927  */
   4928 static void
   4929 micro_ubfe(union tgsi_exec_channel *dst,
   4930            const union tgsi_exec_channel *src0,
   4931            const union tgsi_exec_channel *src1,
   4932            const union tgsi_exec_channel *src2)
   4933 {
   4934    int i;
   4935    for (i = 0; i < 4; i++) {
   4936       int width = src2->u[i] & 0x1f;
   4937       int offset = src1->u[i] & 0x1f;
   4938       if (width == 0)
   4939          dst->u[i] = 0;
   4940       else if (width + offset < 32)
   4941          dst->u[i] = (src0->u[i] << (32 - width - offset)) >> (32 - width);
   4942       else
   4943          dst->u[i] = src0->u[i] >> offset;
   4944    }
   4945 }
   4946 
   4947 /**
   4948  * Bitfield insert: copy low bits from src1 into a region of src0.
   4949  */
   4950 static void
   4951 micro_bfi(union tgsi_exec_channel *dst,
   4952           const union tgsi_exec_channel *src0,
   4953           const union tgsi_exec_channel *src1,
   4954           const union tgsi_exec_channel *src2,
   4955           const union tgsi_exec_channel *src3)
   4956 {
   4957    int i;
   4958    for (i = 0; i < 4; i++) {
   4959       int width = src3->u[i] & 0x1f;
   4960       int offset = src2->u[i] & 0x1f;
   4961       int bitmask = ((1 << width) - 1) << offset;
   4962       dst->u[i] = ((src1->u[i] << offset) & bitmask) | (src0->u[i] & ~bitmask);
   4963    }
   4964 }
   4965 
   4966 static void
   4967 micro_brev(union tgsi_exec_channel *dst,
   4968            const union tgsi_exec_channel *src)
   4969 {
   4970    dst->u[0] = util_bitreverse(src->u[0]);
   4971    dst->u[1] = util_bitreverse(src->u[1]);
   4972    dst->u[2] = util_bitreverse(src->u[2]);
   4973    dst->u[3] = util_bitreverse(src->u[3]);
   4974 }
   4975 
   4976 static void
   4977 micro_popc(union tgsi_exec_channel *dst,
   4978            const union tgsi_exec_channel *src)
   4979 {
   4980    dst->u[0] = util_bitcount(src->u[0]);
   4981    dst->u[1] = util_bitcount(src->u[1]);
   4982    dst->u[2] = util_bitcount(src->u[2]);
   4983    dst->u[3] = util_bitcount(src->u[3]);
   4984 }
   4985 
   4986 static void
   4987 micro_lsb(union tgsi_exec_channel *dst,
   4988           const union tgsi_exec_channel *src)
   4989 {
   4990    dst->i[0] = ffs(src->u[0]) - 1;
   4991    dst->i[1] = ffs(src->u[1]) - 1;
   4992    dst->i[2] = ffs(src->u[2]) - 1;
   4993    dst->i[3] = ffs(src->u[3]) - 1;
   4994 }
   4995 
   4996 static void
   4997 micro_imsb(union tgsi_exec_channel *dst,
   4998            const union tgsi_exec_channel *src)
   4999 {
   5000    dst->i[0] = util_last_bit_signed(src->i[0]) - 1;
   5001    dst->i[1] = util_last_bit_signed(src->i[1]) - 1;
   5002    dst->i[2] = util_last_bit_signed(src->i[2]) - 1;
   5003    dst->i[3] = util_last_bit_signed(src->i[3]) - 1;
   5004 }
   5005 
   5006 static void
   5007 micro_umsb(union tgsi_exec_channel *dst,
   5008            const union tgsi_exec_channel *src)
   5009 {
   5010    dst->i[0] = util_last_bit(src->u[0]) - 1;
   5011    dst->i[1] = util_last_bit(src->u[1]) - 1;
   5012    dst->i[2] = util_last_bit(src->u[2]) - 1;
   5013    dst->i[3] = util_last_bit(src->u[3]) - 1;
   5014 }
   5015 
   5016 /**
   5017  * Execute a TGSI instruction.
   5018  * Returns TRUE if a barrier instruction is hit,
   5019  * otherwise FALSE.
   5020  */
   5021 static boolean
   5022 exec_instruction(
   5023    struct tgsi_exec_machine *mach,
   5024    const struct tgsi_full_instruction *inst,
   5025    int *pc )
   5026 {
   5027    union tgsi_exec_channel r[10];
   5028 
   5029    (*pc)++;
   5030 
   5031    switch (inst->Instruction.Opcode) {
   5032    case TGSI_OPCODE_ARL:
   5033       exec_vector_unary(mach, inst, micro_arl, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
   5034       break;
   5035 
   5036    case TGSI_OPCODE_MOV:
   5037       exec_vector_unary(mach, inst, micro_mov, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
   5038       break;
   5039 
   5040    case TGSI_OPCODE_LIT:
   5041       exec_lit(mach, inst);
   5042       break;
   5043 
   5044    case TGSI_OPCODE_RCP:
   5045       exec_scalar_unary(mach, inst, micro_rcp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5046       break;
   5047 
   5048    case TGSI_OPCODE_RSQ:
   5049       exec_scalar_unary(mach, inst, micro_rsq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5050       break;
   5051 
   5052    case TGSI_OPCODE_EXP:
   5053       exec_exp(mach, inst);
   5054       break;
   5055 
   5056    case TGSI_OPCODE_LOG:
   5057       exec_log(mach, inst);
   5058       break;
   5059 
   5060    case TGSI_OPCODE_MUL:
   5061       exec_vector_binary(mach, inst, micro_mul, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5062       break;
   5063 
   5064    case TGSI_OPCODE_ADD:
   5065       exec_vector_binary(mach, inst, micro_add, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5066       break;
   5067 
   5068    case TGSI_OPCODE_DP3:
   5069       exec_dp3(mach, inst);
   5070       break;
   5071 
   5072    case TGSI_OPCODE_DP4:
   5073       exec_dp4(mach, inst);
   5074       break;
   5075 
   5076    case TGSI_OPCODE_DST:
   5077       exec_dst(mach, inst);
   5078       break;
   5079 
   5080    case TGSI_OPCODE_MIN:
   5081       exec_vector_binary(mach, inst, micro_min, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5082       break;
   5083 
   5084    case TGSI_OPCODE_MAX:
   5085       exec_vector_binary(mach, inst, micro_max, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5086       break;
   5087 
   5088    case TGSI_OPCODE_SLT:
   5089       exec_vector_binary(mach, inst, micro_slt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5090       break;
   5091 
   5092    case TGSI_OPCODE_SGE:
   5093       exec_vector_binary(mach, inst, micro_sge, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5094       break;
   5095 
   5096    case TGSI_OPCODE_MAD:
   5097       exec_vector_trinary(mach, inst, micro_mad, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5098       break;
   5099 
   5100    case TGSI_OPCODE_LRP:
   5101       exec_vector_trinary(mach, inst, micro_lrp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5102       break;
   5103 
   5104    case TGSI_OPCODE_SQRT:
   5105       exec_scalar_unary(mach, inst, micro_sqrt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5106       break;
   5107 
   5108    case TGSI_OPCODE_FRC:
   5109       exec_vector_unary(mach, inst, micro_frc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5110       break;
   5111 
   5112    case TGSI_OPCODE_FLR:
   5113       exec_vector_unary(mach, inst, micro_flr, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5114       break;
   5115 
   5116    case TGSI_OPCODE_ROUND:
   5117       exec_vector_unary(mach, inst, micro_rnd, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5118       break;
   5119 
   5120    case TGSI_OPCODE_EX2:
   5121       exec_scalar_unary(mach, inst, micro_exp2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5122       break;
   5123 
   5124    case TGSI_OPCODE_LG2:
   5125       exec_scalar_unary(mach, inst, micro_lg2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5126       break;
   5127 
   5128    case TGSI_OPCODE_POW:
   5129       exec_scalar_binary(mach, inst, micro_pow, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5130       break;
   5131 
   5132    case TGSI_OPCODE_LDEXP:
   5133       exec_vector_binary(mach, inst, micro_ldexp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5134       break;
   5135 
   5136    case TGSI_OPCODE_COS:
   5137       exec_scalar_unary(mach, inst, micro_cos, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5138       break;
   5139 
   5140    case TGSI_OPCODE_DDX:
   5141       exec_vector_unary(mach, inst, micro_ddx, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5142       break;
   5143 
   5144    case TGSI_OPCODE_DDY:
   5145       exec_vector_unary(mach, inst, micro_ddy, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5146       break;
   5147 
   5148    case TGSI_OPCODE_KILL:
   5149       exec_kill (mach, inst);
   5150       break;
   5151 
   5152    case TGSI_OPCODE_KILL_IF:
   5153       exec_kill_if (mach, inst);
   5154       break;
   5155 
   5156    case TGSI_OPCODE_PK2H:
   5157       exec_pk2h(mach, inst);
   5158       break;
   5159 
   5160    case TGSI_OPCODE_PK2US:
   5161       assert (0);
   5162       break;
   5163 
   5164    case TGSI_OPCODE_PK4B:
   5165       assert (0);
   5166       break;
   5167 
   5168    case TGSI_OPCODE_PK4UB:
   5169       assert (0);
   5170       break;
   5171 
   5172    case TGSI_OPCODE_SEQ:
   5173       exec_vector_binary(mach, inst, micro_seq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5174       break;
   5175 
   5176    case TGSI_OPCODE_SGT:
   5177       exec_vector_binary(mach, inst, micro_sgt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5178       break;
   5179 
   5180    case TGSI_OPCODE_SIN:
   5181       exec_scalar_unary(mach, inst, micro_sin, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5182       break;
   5183 
   5184    case TGSI_OPCODE_SLE:
   5185       exec_vector_binary(mach, inst, micro_sle, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5186       break;
   5187 
   5188    case TGSI_OPCODE_SNE:
   5189       exec_vector_binary(mach, inst, micro_sne, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5190       break;
   5191 
   5192    case TGSI_OPCODE_TEX:
   5193       /* simple texture lookup */
   5194       /* src[0] = texcoord */
   5195       /* src[1] = sampler unit */
   5196       exec_tex(mach, inst, TEX_MODIFIER_NONE, 1);
   5197       break;
   5198 
   5199    case TGSI_OPCODE_TXB:
   5200       /* Texture lookup with lod bias */
   5201       /* src[0] = texcoord (src[0].w = LOD bias) */
   5202       /* src[1] = sampler unit */
   5203       exec_tex(mach, inst, TEX_MODIFIER_LOD_BIAS, 1);
   5204       break;
   5205 
   5206    case TGSI_OPCODE_TXD:
   5207       /* Texture lookup with explicit partial derivatives */
   5208       /* src[0] = texcoord */
   5209       /* src[1] = d[strq]/dx */
   5210       /* src[2] = d[strq]/dy */
   5211       /* src[3] = sampler unit */
   5212       exec_txd(mach, inst);
   5213       break;
   5214 
   5215    case TGSI_OPCODE_TXL:
   5216       /* Texture lookup with explicit LOD */
   5217       /* src[0] = texcoord (src[0].w = LOD) */
   5218       /* src[1] = sampler unit */
   5219       exec_tex(mach, inst, TEX_MODIFIER_EXPLICIT_LOD, 1);
   5220       break;
   5221 
   5222    case TGSI_OPCODE_TXP:
   5223       /* Texture lookup with projection */
   5224       /* src[0] = texcoord (src[0].w = projection) */
   5225       /* src[1] = sampler unit */
   5226       exec_tex(mach, inst, TEX_MODIFIER_PROJECTED, 1);
   5227       break;
   5228 
   5229    case TGSI_OPCODE_TG4:
   5230       /* src[0] = texcoord */
   5231       /* src[1] = component */
   5232       /* src[2] = sampler unit */
   5233       exec_tex(mach, inst, TEX_MODIFIER_GATHER, 2);
   5234       break;
   5235 
   5236    case TGSI_OPCODE_LODQ:
   5237       /* src[0] = texcoord */
   5238       /* src[1] = sampler unit */
   5239       exec_lodq(mach, inst);
   5240       break;
   5241 
   5242    case TGSI_OPCODE_UP2H:
   5243       exec_up2h(mach, inst);
   5244       break;
   5245 
   5246    case TGSI_OPCODE_UP2US:
   5247       assert (0);
   5248       break;
   5249 
   5250    case TGSI_OPCODE_UP4B:
   5251       assert (0);
   5252       break;
   5253 
   5254    case TGSI_OPCODE_UP4UB:
   5255       assert (0);
   5256       break;
   5257 
   5258    case TGSI_OPCODE_ARR:
   5259       exec_vector_unary(mach, inst, micro_arr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
   5260       break;
   5261 
   5262    case TGSI_OPCODE_CAL:
   5263       /* skip the call if no execution channels are enabled */
   5264       if (mach->ExecMask) {
   5265          /* do the call */
   5266 
   5267          /* First, record the depths of the execution stacks.
   5268           * This is important for deeply nested/looped return statements.
   5269           * We have to unwind the stacks by the correct amount.  For a
   5270           * real code generator, we could determine the number of entries
   5271           * to pop off each stack with simple static analysis and avoid
   5272           * implementing this data structure at run time.
   5273           */
   5274          mach->CallStack[mach->CallStackTop].CondStackTop = mach->CondStackTop;
   5275          mach->CallStack[mach->CallStackTop].LoopStackTop = mach->LoopStackTop;
   5276          mach->CallStack[mach->CallStackTop].ContStackTop = mach->ContStackTop;
   5277          mach->CallStack[mach->CallStackTop].SwitchStackTop = mach->SwitchStackTop;
   5278          mach->CallStack[mach->CallStackTop].BreakStackTop = mach->BreakStackTop;
   5279          /* note that PC was already incremented above */
   5280          mach->CallStack[mach->CallStackTop].ReturnAddr = *pc;
   5281 
   5282          mach->CallStackTop++;
   5283 
   5284          /* Second, push the Cond, Loop, Cont, Func stacks */
   5285          assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
   5286          assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
   5287          assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
   5288          assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
   5289          assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
   5290          assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
   5291 
   5292          mach->CondStack[mach->CondStackTop++] = mach->CondMask;
   5293          mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
   5294          mach->ContStack[mach->ContStackTop++] = mach->ContMask;
   5295          mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
   5296          mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
   5297          mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
   5298 
   5299          /* Finally, jump to the subroutine.  The label is a pointer
   5300           * (an instruction number) to the BGNSUB instruction.
   5301           */
   5302          *pc = inst->Label.Label;
   5303          assert(mach->Instructions[*pc].Instruction.Opcode
   5304                 == TGSI_OPCODE_BGNSUB);
   5305       }
   5306       break;
   5307 
   5308    case TGSI_OPCODE_RET:
   5309       mach->FuncMask &= ~mach->ExecMask;
   5310       UPDATE_EXEC_MASK(mach);
   5311 
   5312       if (mach->FuncMask == 0x0) {
   5313          /* really return now (otherwise, keep executing) */
   5314 
   5315          if (mach->CallStackTop == 0) {
   5316             /* returning from main() */
   5317             mach->CondStackTop = 0;
   5318             mach->LoopStackTop = 0;
   5319             mach->ContStackTop = 0;
   5320             mach->LoopLabelStackTop = 0;
   5321             mach->SwitchStackTop = 0;
   5322             mach->BreakStackTop = 0;
   5323             *pc = -1;
   5324             return FALSE;
   5325          }
   5326 
   5327          assert(mach->CallStackTop > 0);
   5328          mach->CallStackTop--;
   5329 
   5330          mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
   5331          mach->CondMask = mach->CondStack[mach->CondStackTop];
   5332 
   5333          mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
   5334          mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
   5335 
   5336          mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
   5337          mach->ContMask = mach->ContStack[mach->ContStackTop];
   5338 
   5339          mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
   5340          mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
   5341 
   5342          mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
   5343          mach->BreakType = mach->BreakStack[mach->BreakStackTop];
   5344 
   5345          assert(mach->FuncStackTop > 0);
   5346          mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
   5347 
   5348          *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
   5349 
   5350          UPDATE_EXEC_MASK(mach);
   5351       }
   5352       break;
   5353 
   5354    case TGSI_OPCODE_SSG:
   5355       exec_vector_unary(mach, inst, micro_sgn, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5356       break;
   5357 
   5358    case TGSI_OPCODE_CMP:
   5359       exec_vector_trinary(mach, inst, micro_cmp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5360       break;
   5361 
   5362    case TGSI_OPCODE_DIV:
   5363       exec_vector_binary(mach, inst, micro_div, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5364       break;
   5365 
   5366    case TGSI_OPCODE_DP2:
   5367       exec_dp2(mach, inst);
   5368       break;
   5369 
   5370    case TGSI_OPCODE_IF:
   5371       /* push CondMask */
   5372       assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
   5373       mach->CondStack[mach->CondStackTop++] = mach->CondMask;
   5374       FETCH( &r[0], 0, TGSI_CHAN_X );
   5375       /* update CondMask */
   5376       if( ! r[0].f[0] ) {
   5377          mach->CondMask &= ~0x1;
   5378       }
   5379       if( ! r[0].f[1] ) {
   5380          mach->CondMask &= ~0x2;
   5381       }
   5382       if( ! r[0].f[2] ) {
   5383          mach->CondMask &= ~0x4;
   5384       }
   5385       if( ! r[0].f[3] ) {
   5386          mach->CondMask &= ~0x8;
   5387       }
   5388       UPDATE_EXEC_MASK(mach);
   5389       /* Todo: If CondMask==0, jump to ELSE */
   5390       break;
   5391 
   5392    case TGSI_OPCODE_UIF:
   5393       /* push CondMask */
   5394       assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
   5395       mach->CondStack[mach->CondStackTop++] = mach->CondMask;
   5396       IFETCH( &r[0], 0, TGSI_CHAN_X );
   5397       /* update CondMask */
   5398       if( ! r[0].u[0] ) {
   5399          mach->CondMask &= ~0x1;
   5400       }
   5401       if( ! r[0].u[1] ) {
   5402          mach->CondMask &= ~0x2;
   5403       }
   5404       if( ! r[0].u[2] ) {
   5405          mach->CondMask &= ~0x4;
   5406       }
   5407       if( ! r[0].u[3] ) {
   5408          mach->CondMask &= ~0x8;
   5409       }
   5410       UPDATE_EXEC_MASK(mach);
   5411       /* Todo: If CondMask==0, jump to ELSE */
   5412       break;
   5413 
   5414    case TGSI_OPCODE_ELSE:
   5415       /* invert CondMask wrt previous mask */
   5416       {
   5417          uint prevMask;
   5418          assert(mach->CondStackTop > 0);
   5419          prevMask = mach->CondStack[mach->CondStackTop - 1];
   5420          mach->CondMask = ~mach->CondMask & prevMask;
   5421          UPDATE_EXEC_MASK(mach);
   5422          /* Todo: If CondMask==0, jump to ENDIF */
   5423       }
   5424       break;
   5425 
   5426    case TGSI_OPCODE_ENDIF:
   5427       /* pop CondMask */
   5428       assert(mach->CondStackTop > 0);
   5429       mach->CondMask = mach->CondStack[--mach->CondStackTop];
   5430       UPDATE_EXEC_MASK(mach);
   5431       break;
   5432 
   5433    case TGSI_OPCODE_END:
   5434       /* make sure we end primitives which haven't
   5435        * been explicitly emitted */
   5436       conditional_emit_primitive(mach);
   5437       /* halt execution */
   5438       *pc = -1;
   5439       break;
   5440 
   5441    case TGSI_OPCODE_CEIL:
   5442       exec_vector_unary(mach, inst, micro_ceil, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5443       break;
   5444 
   5445    case TGSI_OPCODE_I2F:
   5446       exec_vector_unary(mach, inst, micro_i2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_INT);
   5447       break;
   5448 
   5449    case TGSI_OPCODE_NOT:
   5450       exec_vector_unary(mach, inst, micro_not, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   5451       break;
   5452 
   5453    case TGSI_OPCODE_TRUNC:
   5454       exec_vector_unary(mach, inst, micro_trunc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
   5455       break;
   5456 
   5457    case TGSI_OPCODE_SHL:
   5458       exec_vector_binary(mach, inst, micro_shl, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   5459       break;
   5460 
   5461    case TGSI_OPCODE_AND:
   5462       exec_vector_binary(mach, inst, micro_and, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   5463       break;
   5464 
   5465    case TGSI_OPCODE_OR:
   5466       exec_vector_binary(mach, inst, micro_or, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   5467       break;
   5468 
   5469    case TGSI_OPCODE_MOD:
   5470       exec_vector_binary(mach, inst, micro_mod, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
   5471       break;
   5472 
   5473    case TGSI_OPCODE_XOR:
   5474       exec_vector_binary(mach, inst, micro_xor, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   5475       break;
   5476 
   5477    case TGSI_OPCODE_TXF:
   5478       exec_txf(mach, inst);
   5479       break;
   5480 
   5481    case TGSI_OPCODE_TXQ:
   5482       exec_txq(mach, inst);
   5483       break;
   5484 
   5485    case TGSI_OPCODE_EMIT:
   5486       emit_vertex(mach);
   5487       break;
   5488 
   5489    case TGSI_OPCODE_ENDPRIM:
   5490       emit_primitive(mach);
   5491       break;
   5492 
   5493    case TGSI_OPCODE_BGNLOOP:
   5494       /* push LoopMask and ContMasks */
   5495       assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
   5496       assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
   5497       assert(mach->LoopLabelStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
   5498       assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
   5499 
   5500       mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
   5501       mach->ContStack[mach->ContStackTop++] = mach->ContMask;
   5502       mach->LoopLabelStack[mach->LoopLabelStackTop++] = *pc - 1;
   5503       mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
   5504       mach->BreakType = TGSI_EXEC_BREAK_INSIDE_LOOP;
   5505       break;
   5506 
   5507    case TGSI_OPCODE_ENDLOOP:
   5508       /* Restore ContMask, but don't pop */
   5509       assert(mach->ContStackTop > 0);
   5510       mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
   5511       UPDATE_EXEC_MASK(mach);
   5512       if (mach->ExecMask) {
   5513          /* repeat loop: jump to instruction just past BGNLOOP */
   5514          assert(mach->LoopLabelStackTop > 0);
   5515          *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
   5516       }
   5517       else {
   5518          /* exit loop: pop LoopMask */
   5519          assert(mach->LoopStackTop > 0);
   5520          mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
   5521          /* pop ContMask */
   5522          assert(mach->ContStackTop > 0);
   5523          mach->ContMask = mach->ContStack[--mach->ContStackTop];
   5524          assert(mach->LoopLabelStackTop > 0);
   5525          --mach->LoopLabelStackTop;
   5526 
   5527          mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
   5528       }
   5529       UPDATE_EXEC_MASK(mach);
   5530       break;
   5531 
   5532    case TGSI_OPCODE_BRK:
   5533       exec_break(mach);
   5534       break;
   5535 
   5536    case TGSI_OPCODE_CONT:
   5537       /* turn off cont channels for each enabled exec channel */
   5538       mach->ContMask &= ~mach->ExecMask;
   5539       /* Todo: if mach->LoopMask == 0, jump to end of loop */
   5540       UPDATE_EXEC_MASK(mach);
   5541       break;
   5542 
   5543    case TGSI_OPCODE_BGNSUB:
   5544       /* no-op */
   5545       break;
   5546 
   5547    case TGSI_OPCODE_ENDSUB:
   5548       /*
   5549        * XXX: This really should be a no-op. We should never reach this opcode.
   5550        */
   5551 
   5552       assert(mach->CallStackTop > 0);
   5553       mach->CallStackTop--;
   5554 
   5555       mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
   5556       mach->CondMask = mach->CondStack[mach->CondStackTop];
   5557 
   5558       mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
   5559       mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
   5560 
   5561       mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
   5562       mach->ContMask = mach->ContStack[mach->ContStackTop];
   5563 
   5564       mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
   5565       mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
   5566 
   5567       mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
   5568       mach->BreakType = mach->BreakStack[mach->BreakStackTop];
   5569 
   5570       assert(mach->FuncStackTop > 0);
   5571       mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
   5572 
   5573       *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
   5574 
   5575       UPDATE_EXEC_MASK(mach);
   5576       break;
   5577 
   5578    case TGSI_OPCODE_NOP:
   5579       break;
   5580 
   5581    case TGSI_OPCODE_F2I:
   5582       exec_vector_unary(mach, inst, micro_f2i, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
   5583       break;
   5584 
   5585    case TGSI_OPCODE_FSEQ:
   5586       exec_vector_binary(mach, inst, micro_fseq, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
   5587       break;
   5588 
   5589    case TGSI_OPCODE_FSGE:
   5590       exec_vector_binary(mach, inst, micro_fsge, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
   5591       break;
   5592 
   5593    case TGSI_OPCODE_FSLT:
   5594       exec_vector_binary(mach, inst, micro_fslt, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
   5595       break;
   5596 
   5597    case TGSI_OPCODE_FSNE:
   5598       exec_vector_binary(mach, inst, micro_fsne, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
   5599       break;
   5600 
   5601    case TGSI_OPCODE_IDIV:
   5602       exec_vector_binary(mach, inst, micro_idiv, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
   5603       break;
   5604 
   5605    case TGSI_OPCODE_IMAX:
   5606       exec_vector_binary(mach, inst, micro_imax, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
   5607       break;
   5608 
   5609    case TGSI_OPCODE_IMIN:
   5610       exec_vector_binary(mach, inst, micro_imin, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
   5611       break;
   5612 
   5613    case TGSI_OPCODE_INEG:
   5614       exec_vector_unary(mach, inst, micro_ineg, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
   5615       break;
   5616 
   5617    case TGSI_OPCODE_ISGE:
   5618       exec_vector_binary(mach, inst, micro_isge, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
   5619       break;
   5620 
   5621    case TGSI_OPCODE_ISHR:
   5622       exec_vector_binary(mach, inst, micro_ishr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
   5623       break;
   5624 
   5625    case TGSI_OPCODE_ISLT:
   5626       exec_vector_binary(mach, inst, micro_islt, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
   5627       break;
   5628 
   5629    case TGSI_OPCODE_F2U:
   5630       exec_vector_unary(mach, inst, micro_f2u, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
   5631       break;
   5632 
   5633    case TGSI_OPCODE_U2F:
   5634       exec_vector_unary(mach, inst, micro_u2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_UINT);
   5635       break;
   5636 
   5637    case TGSI_OPCODE_UADD:
   5638       exec_vector_binary(mach, inst, micro_uadd, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
   5639       break;
   5640 
   5641    case TGSI_OPCODE_UDIV:
   5642       exec_vector_binary(mach, inst, micro_udiv, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   5643       break;
   5644 
   5645    case TGSI_OPCODE_UMAD:
   5646       exec_vector_trinary(mach, inst, micro_umad, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   5647       break;
   5648 
   5649    case TGSI_OPCODE_UMAX:
   5650       exec_vector_binary(mach, inst, micro_umax, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   5651       break;
   5652 
   5653    case TGSI_OPCODE_UMIN:
   5654       exec_vector_binary(mach, inst, micro_umin, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   5655       break;
   5656 
   5657    case TGSI_OPCODE_UMOD:
   5658       exec_vector_binary(mach, inst, micro_umod, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   5659       break;
   5660 
   5661    case TGSI_OPCODE_UMUL:
   5662       exec_vector_binary(mach, inst, micro_umul, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   5663       break;
   5664 
   5665    case TGSI_OPCODE_IMUL_HI:
   5666       exec_vector_binary(mach, inst, micro_imul_hi, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
   5667       break;
   5668 
   5669    case TGSI_OPCODE_UMUL_HI:
   5670       exec_vector_binary(mach, inst, micro_umul_hi, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   5671       break;
   5672 
   5673    case TGSI_OPCODE_USEQ:
   5674       exec_vector_binary(mach, inst, micro_useq, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   5675       break;
   5676 
   5677    case TGSI_OPCODE_USGE:
   5678       exec_vector_binary(mach, inst, micro_usge, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   5679       break;
   5680 
   5681    case TGSI_OPCODE_USHR:
   5682       exec_vector_binary(mach, inst, micro_ushr, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   5683       break;
   5684 
   5685    case TGSI_OPCODE_USLT:
   5686       exec_vector_binary(mach, inst, micro_uslt, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   5687       break;
   5688 
   5689    case TGSI_OPCODE_USNE:
   5690       exec_vector_binary(mach, inst, micro_usne, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   5691       break;
   5692 
   5693    case TGSI_OPCODE_SWITCH:
   5694       exec_switch(mach, inst);
   5695       break;
   5696 
   5697    case TGSI_OPCODE_CASE:
   5698       exec_case(mach, inst);
   5699       break;
   5700 
   5701    case TGSI_OPCODE_DEFAULT:
   5702       exec_default(mach);
   5703       break;
   5704 
   5705    case TGSI_OPCODE_ENDSWITCH:
   5706       exec_endswitch(mach);
   5707       break;
   5708 
   5709    case TGSI_OPCODE_SAMPLE_I:
   5710       exec_txf(mach, inst);
   5711       break;
   5712 
   5713    case TGSI_OPCODE_SAMPLE_I_MS:
   5714       exec_txf(mach, inst);
   5715       break;
   5716 
   5717    case TGSI_OPCODE_SAMPLE:
   5718       exec_sample(mach, inst, TEX_MODIFIER_NONE, FALSE);
   5719       break;
   5720 
   5721    case TGSI_OPCODE_SAMPLE_B:
   5722       exec_sample(mach, inst, TEX_MODIFIER_LOD_BIAS, FALSE);
   5723       break;
   5724 
   5725    case TGSI_OPCODE_SAMPLE_C:
   5726       exec_sample(mach, inst, TEX_MODIFIER_NONE, TRUE);
   5727       break;
   5728 
   5729    case TGSI_OPCODE_SAMPLE_C_LZ:
   5730       exec_sample(mach, inst, TEX_MODIFIER_LEVEL_ZERO, TRUE);
   5731       break;
   5732 
   5733    case TGSI_OPCODE_SAMPLE_D:
   5734       exec_sample_d(mach, inst);
   5735       break;
   5736 
   5737    case TGSI_OPCODE_SAMPLE_L:
   5738       exec_sample(mach, inst, TEX_MODIFIER_EXPLICIT_LOD, FALSE);
   5739       break;
   5740 
   5741    case TGSI_OPCODE_GATHER4:
   5742       exec_sample(mach, inst, TEX_MODIFIER_GATHER, FALSE);
   5743       break;
   5744 
   5745    case TGSI_OPCODE_SVIEWINFO:
   5746       exec_txq(mach, inst);
   5747       break;
   5748 
   5749    case TGSI_OPCODE_SAMPLE_POS:
   5750       assert(0);
   5751       break;
   5752 
   5753    case TGSI_OPCODE_SAMPLE_INFO:
   5754       assert(0);
   5755       break;
   5756 
   5757    case TGSI_OPCODE_LOD:
   5758       exec_lodq(mach, inst);
   5759       break;
   5760 
   5761    case TGSI_OPCODE_UARL:
   5762       exec_vector_unary(mach, inst, micro_uarl, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_UINT);
   5763       break;
   5764 
   5765    case TGSI_OPCODE_UCMP:
   5766       exec_ucmp(mach, inst);
   5767       break;
   5768 
   5769    case TGSI_OPCODE_IABS:
   5770       exec_vector_unary(mach, inst, micro_iabs, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
   5771       break;
   5772 
   5773    case TGSI_OPCODE_ISSG:
   5774       exec_vector_unary(mach, inst, micro_isgn, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
   5775       break;
   5776 
   5777    case TGSI_OPCODE_TEX2:
   5778       /* simple texture lookup */
   5779       /* src[0] = texcoord */
   5780       /* src[1] = compare */
   5781       /* src[2] = sampler unit */
   5782       exec_tex(mach, inst, TEX_MODIFIER_NONE, 2);
   5783       break;
   5784    case TGSI_OPCODE_TXB2:
   5785       /* simple texture lookup */
   5786       /* src[0] = texcoord */
   5787       /* src[1] = bias */
   5788       /* src[2] = sampler unit */
   5789       exec_tex(mach, inst, TEX_MODIFIER_LOD_BIAS, 2);
   5790       break;
   5791    case TGSI_OPCODE_TXL2:
   5792       /* simple texture lookup */
   5793       /* src[0] = texcoord */
   5794       /* src[1] = lod */
   5795       /* src[2] = sampler unit */
   5796       exec_tex(mach, inst, TEX_MODIFIER_EXPLICIT_LOD, 2);
   5797       break;
   5798 
   5799    case TGSI_OPCODE_IBFE:
   5800       exec_vector_trinary(mach, inst, micro_ibfe, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
   5801       break;
   5802    case TGSI_OPCODE_UBFE:
   5803       exec_vector_trinary(mach, inst, micro_ubfe, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   5804       break;
   5805    case TGSI_OPCODE_BFI:
   5806       exec_vector_quaternary(mach, inst, micro_bfi, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   5807       break;
   5808    case TGSI_OPCODE_BREV:
   5809       exec_vector_unary(mach, inst, micro_brev, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   5810       break;
   5811    case TGSI_OPCODE_POPC:
   5812       exec_vector_unary(mach, inst, micro_popc, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
   5813       break;
   5814    case TGSI_OPCODE_LSB:
   5815       exec_vector_unary(mach, inst, micro_lsb, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_UINT);
   5816       break;
   5817    case TGSI_OPCODE_IMSB:
   5818       exec_vector_unary(mach, inst, micro_imsb, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
   5819       break;
   5820    case TGSI_OPCODE_UMSB:
   5821       exec_vector_unary(mach, inst, micro_umsb, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_UINT);
   5822       break;
   5823 
   5824    case TGSI_OPCODE_F2D:
   5825       exec_t_2_64(mach, inst, micro_f2d, TGSI_EXEC_DATA_FLOAT);
   5826       break;
   5827 
   5828    case TGSI_OPCODE_D2F:
   5829       exec_64_2_t(mach, inst, micro_d2f, TGSI_EXEC_DATA_FLOAT);
   5830       break;
   5831 
   5832    case TGSI_OPCODE_DABS:
   5833       exec_double_unary(mach, inst, micro_dabs);
   5834       break;
   5835 
   5836    case TGSI_OPCODE_DNEG:
   5837       exec_double_unary(mach, inst, micro_dneg);
   5838       break;
   5839 
   5840    case TGSI_OPCODE_DADD:
   5841       exec_double_binary(mach, inst, micro_dadd, TGSI_EXEC_DATA_DOUBLE);
   5842       break;
   5843 
   5844    case TGSI_OPCODE_DDIV:
   5845       exec_double_binary(mach, inst, micro_ddiv, TGSI_EXEC_DATA_DOUBLE);
   5846       break;
   5847 
   5848    case TGSI_OPCODE_DMUL:
   5849       exec_double_binary(mach, inst, micro_dmul, TGSI_EXEC_DATA_DOUBLE);
   5850       break;
   5851 
   5852    case TGSI_OPCODE_DMAX:
   5853       exec_double_binary(mach, inst, micro_dmax, TGSI_EXEC_DATA_DOUBLE);
   5854       break;
   5855 
   5856    case TGSI_OPCODE_DMIN:
   5857       exec_double_binary(mach, inst, micro_dmin, TGSI_EXEC_DATA_DOUBLE);
   5858       break;
   5859 
   5860    case TGSI_OPCODE_DSLT:
   5861       exec_double_binary(mach, inst, micro_dslt, TGSI_EXEC_DATA_UINT);
   5862       break;
   5863 
   5864    case TGSI_OPCODE_DSGE:
   5865       exec_double_binary(mach, inst, micro_dsge, TGSI_EXEC_DATA_UINT);
   5866       break;
   5867 
   5868    case TGSI_OPCODE_DSEQ:
   5869       exec_double_binary(mach, inst, micro_dseq, TGSI_EXEC_DATA_UINT);
   5870       break;
   5871 
   5872    case TGSI_OPCODE_DSNE:
   5873       exec_double_binary(mach, inst, micro_dsne, TGSI_EXEC_DATA_UINT);
   5874       break;
   5875 
   5876    case TGSI_OPCODE_DRCP:
   5877       exec_double_unary(mach, inst, micro_drcp);
   5878       break;
   5879 
   5880    case TGSI_OPCODE_DSQRT:
   5881       exec_double_unary(mach, inst, micro_dsqrt);
   5882       break;
   5883 
   5884    case TGSI_OPCODE_DRSQ:
   5885       exec_double_unary(mach, inst, micro_drsq);
   5886       break;
   5887 
   5888    case TGSI_OPCODE_DMAD:
   5889       exec_double_trinary(mach, inst, micro_dmad);
   5890       break;
   5891 
   5892    case TGSI_OPCODE_DFRAC:
   5893       exec_double_unary(mach, inst, micro_dfrac);
   5894       break;
   5895 
   5896    case TGSI_OPCODE_DLDEXP:
   5897       exec_dldexp(mach, inst);
   5898       break;
   5899 
   5900    case TGSI_OPCODE_DFRACEXP:
   5901       exec_dfracexp(mach, inst);
   5902       break;
   5903 
   5904    case TGSI_OPCODE_I2D:
   5905       exec_t_2_64(mach, inst, micro_i2d, TGSI_EXEC_DATA_INT);
   5906       break;
   5907 
   5908    case TGSI_OPCODE_D2I:
   5909       exec_64_2_t(mach, inst, micro_d2i, TGSI_EXEC_DATA_INT);
   5910       break;
   5911 
   5912    case TGSI_OPCODE_U2D:
   5913       exec_t_2_64(mach, inst, micro_u2d, TGSI_EXEC_DATA_UINT);
   5914       break;
   5915 
   5916    case TGSI_OPCODE_D2U:
   5917       exec_64_2_t(mach, inst, micro_d2u, TGSI_EXEC_DATA_INT);
   5918       break;
   5919 
   5920    case TGSI_OPCODE_LOAD:
   5921       exec_load(mach, inst);
   5922       break;
   5923 
   5924    case TGSI_OPCODE_STORE:
   5925       exec_store(mach, inst);
   5926       break;
   5927 
   5928    case TGSI_OPCODE_ATOMUADD:
   5929    case TGSI_OPCODE_ATOMXCHG:
   5930    case TGSI_OPCODE_ATOMCAS:
   5931    case TGSI_OPCODE_ATOMAND:
   5932    case TGSI_OPCODE_ATOMOR:
   5933    case TGSI_OPCODE_ATOMXOR:
   5934    case TGSI_OPCODE_ATOMUMIN:
   5935    case TGSI_OPCODE_ATOMUMAX:
   5936    case TGSI_OPCODE_ATOMIMIN:
   5937    case TGSI_OPCODE_ATOMIMAX:
   5938       exec_atomop(mach, inst);
   5939       break;
   5940 
   5941    case TGSI_OPCODE_RESQ:
   5942       exec_resq(mach, inst);
   5943       break;
   5944    case TGSI_OPCODE_BARRIER:
   5945    case TGSI_OPCODE_MEMBAR:
   5946       return TRUE;
   5947       break;
   5948 
   5949    case TGSI_OPCODE_I64ABS:
   5950       exec_double_unary(mach, inst, micro_i64abs);
   5951       break;
   5952 
   5953    case TGSI_OPCODE_I64SSG:
   5954       exec_double_unary(mach, inst, micro_i64sgn);
   5955       break;
   5956 
   5957    case TGSI_OPCODE_I64NEG:
   5958       exec_double_unary(mach, inst, micro_i64neg);
   5959       break;
   5960 
   5961    case TGSI_OPCODE_U64SEQ:
   5962       exec_double_binary(mach, inst, micro_u64seq, TGSI_EXEC_DATA_UINT);
   5963       break;
   5964 
   5965    case TGSI_OPCODE_U64SNE:
   5966       exec_double_binary(mach, inst, micro_u64sne, TGSI_EXEC_DATA_UINT);
   5967       break;
   5968 
   5969    case TGSI_OPCODE_I64SLT:
   5970       exec_double_binary(mach, inst, micro_i64slt, TGSI_EXEC_DATA_UINT);
   5971       break;
   5972    case TGSI_OPCODE_U64SLT:
   5973       exec_double_binary(mach, inst, micro_u64slt, TGSI_EXEC_DATA_UINT);
   5974       break;
   5975 
   5976    case TGSI_OPCODE_I64SGE:
   5977       exec_double_binary(mach, inst, micro_i64sge, TGSI_EXEC_DATA_UINT);
   5978       break;
   5979    case TGSI_OPCODE_U64SGE:
   5980       exec_double_binary(mach, inst, micro_u64sge, TGSI_EXEC_DATA_UINT);
   5981       break;
   5982 
   5983    case TGSI_OPCODE_I64MIN:
   5984       exec_double_binary(mach, inst, micro_i64min, TGSI_EXEC_DATA_INT64);
   5985       break;
   5986    case TGSI_OPCODE_U64MIN:
   5987       exec_double_binary(mach, inst, micro_u64min, TGSI_EXEC_DATA_UINT64);
   5988       break;
   5989    case TGSI_OPCODE_I64MAX:
   5990       exec_double_binary(mach, inst, micro_i64max, TGSI_EXEC_DATA_INT64);
   5991       break;
   5992    case TGSI_OPCODE_U64MAX:
   5993       exec_double_binary(mach, inst, micro_u64max, TGSI_EXEC_DATA_UINT64);
   5994       break;
   5995    case TGSI_OPCODE_U64ADD:
   5996       exec_double_binary(mach, inst, micro_u64add, TGSI_EXEC_DATA_UINT64);
   5997       break;
   5998    case TGSI_OPCODE_U64MUL:
   5999       exec_double_binary(mach, inst, micro_u64mul, TGSI_EXEC_DATA_UINT64);
   6000       break;
   6001    case TGSI_OPCODE_U64SHL:
   6002       exec_arg0_64_arg1_32(mach, inst, micro_u64shl);
   6003       break;
   6004    case TGSI_OPCODE_I64SHR:
   6005       exec_arg0_64_arg1_32(mach, inst, micro_i64shr);
   6006       break;
   6007    case TGSI_OPCODE_U64SHR:
   6008       exec_arg0_64_arg1_32(mach, inst, micro_u64shr);
   6009       break;
   6010    case TGSI_OPCODE_U64DIV:
   6011       exec_double_binary(mach, inst, micro_u64div, TGSI_EXEC_DATA_UINT64);
   6012       break;
   6013    case TGSI_OPCODE_I64DIV:
   6014       exec_double_binary(mach, inst, micro_i64div, TGSI_EXEC_DATA_INT64);
   6015       break;
   6016    case TGSI_OPCODE_U64MOD:
   6017       exec_double_binary(mach, inst, micro_u64mod, TGSI_EXEC_DATA_UINT64);
   6018       break;
   6019    case TGSI_OPCODE_I64MOD:
   6020       exec_double_binary(mach, inst, micro_i64mod, TGSI_EXEC_DATA_INT64);
   6021       break;
   6022 
   6023    case TGSI_OPCODE_F2U64:
   6024       exec_t_2_64(mach, inst, micro_f2u64, TGSI_EXEC_DATA_FLOAT);
   6025       break;
   6026 
   6027    case TGSI_OPCODE_F2I64:
   6028       exec_t_2_64(mach, inst, micro_f2i64, TGSI_EXEC_DATA_FLOAT);
   6029       break;
   6030 
   6031    case TGSI_OPCODE_U2I64:
   6032       exec_t_2_64(mach, inst, micro_u2i64, TGSI_EXEC_DATA_INT);
   6033       break;
   6034    case TGSI_OPCODE_I2I64:
   6035       exec_t_2_64(mach, inst, micro_i2i64, TGSI_EXEC_DATA_INT);
   6036       break;
   6037 
   6038    case TGSI_OPCODE_D2U64:
   6039       exec_double_unary(mach, inst, micro_d2u64);
   6040       break;
   6041 
   6042    case TGSI_OPCODE_D2I64:
   6043       exec_double_unary(mach, inst, micro_d2i64);
   6044       break;
   6045 
   6046    case TGSI_OPCODE_U642F:
   6047       exec_64_2_t(mach, inst, micro_u642f, TGSI_EXEC_DATA_FLOAT);
   6048       break;
   6049    case TGSI_OPCODE_I642F:
   6050       exec_64_2_t(mach, inst, micro_i642f, TGSI_EXEC_DATA_FLOAT);
   6051       break;
   6052 
   6053    case TGSI_OPCODE_U642D:
   6054       exec_double_unary(mach, inst, micro_u642d);
   6055       break;
   6056    case TGSI_OPCODE_I642D:
   6057       exec_double_unary(mach, inst, micro_i642d);
   6058       break;
   6059 
   6060    default:
   6061       assert( 0 );
   6062    }
   6063    return FALSE;
   6064 }
   6065 
   6066 static void
   6067 tgsi_exec_machine_setup_masks(struct tgsi_exec_machine *mach)
   6068 {
   6069    uint default_mask = 0xf;
   6070 
   6071    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
   6072    mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
   6073 
   6074    if (mach->ShaderType == PIPE_SHADER_GEOMETRY) {
   6075       mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
   6076       mach->Primitives[0] = 0;
   6077       /* GS runs on a single primitive for now */
   6078       default_mask = 0x1;
   6079    }
   6080 
   6081    if (mach->NonHelperMask == 0)
   6082       mach->NonHelperMask = default_mask;
   6083    mach->CondMask = default_mask;
   6084    mach->LoopMask = default_mask;
   6085    mach->ContMask = default_mask;
   6086    mach->FuncMask = default_mask;
   6087    mach->ExecMask = default_mask;
   6088 
   6089    mach->Switch.mask = default_mask;
   6090 
   6091    assert(mach->CondStackTop == 0);
   6092    assert(mach->LoopStackTop == 0);
   6093    assert(mach->ContStackTop == 0);
   6094    assert(mach->SwitchStackTop == 0);
   6095    assert(mach->BreakStackTop == 0);
   6096    assert(mach->CallStackTop == 0);
   6097 }
   6098 
   6099 /**
   6100  * Run TGSI interpreter.
   6101  * \return bitmask of "alive" quad components
   6102  */
   6103 uint
   6104 tgsi_exec_machine_run( struct tgsi_exec_machine *mach, int start_pc )
   6105 {
   6106    uint i;
   6107 
   6108    mach->pc = start_pc;
   6109 
   6110    if (!start_pc) {
   6111       tgsi_exec_machine_setup_masks(mach);
   6112 
   6113       /* execute declarations (interpolants) */
   6114       for (i = 0; i < mach->NumDeclarations; i++) {
   6115          exec_declaration( mach, mach->Declarations+i );
   6116       }
   6117    }
   6118 
   6119    {
   6120 #if DEBUG_EXECUTION
   6121       struct tgsi_exec_vector temps[TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS];
   6122       struct tgsi_exec_vector outputs[PIPE_MAX_ATTRIBS];
   6123       uint inst = 1;
   6124 
   6125       if (!start_pc) {
   6126          memset(mach->Temps, 0, sizeof(temps));
   6127          if (mach->Outputs)
   6128             memset(mach->Outputs, 0, sizeof(outputs));
   6129          memset(temps, 0, sizeof(temps));
   6130          memset(outputs, 0, sizeof(outputs));
   6131       }
   6132 #endif
   6133 
   6134       /* execute instructions, until pc is set to -1 */
   6135       while (mach->pc != -1) {
   6136          boolean barrier_hit;
   6137 #if DEBUG_EXECUTION
   6138          uint i;
   6139 
   6140          tgsi_dump_instruction(&mach->Instructions[mach->pc], inst++);
   6141 #endif
   6142 
   6143          assert(mach->pc < (int) mach->NumInstructions);
   6144          barrier_hit = exec_instruction(mach, mach->Instructions + mach->pc, &mach->pc);
   6145 
   6146          /* for compute shaders if we hit a barrier return now for later rescheduling */
   6147          if (barrier_hit && mach->ShaderType == PIPE_SHADER_COMPUTE)
   6148             return 0;
   6149 
   6150 #if DEBUG_EXECUTION
   6151          for (i = 0; i < TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS; i++) {
   6152             if (memcmp(&temps[i], &mach->Temps[i], sizeof(temps[i]))) {
   6153                uint j;
   6154 
   6155                memcpy(&temps[i], &mach->Temps[i], sizeof(temps[i]));
   6156                debug_printf("TEMP[%2u] = ", i);
   6157                for (j = 0; j < 4; j++) {
   6158                   if (j > 0) {
   6159                      debug_printf("           ");
   6160                   }
   6161                   debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
   6162                                temps[i].xyzw[0].f[j], temps[i].xyzw[0].u[j],
   6163                                temps[i].xyzw[1].f[j], temps[i].xyzw[1].u[j],
   6164                                temps[i].xyzw[2].f[j], temps[i].xyzw[2].u[j],
   6165                                temps[i].xyzw[3].f[j], temps[i].xyzw[3].u[j]);
   6166                }
   6167             }
   6168          }
   6169          if (mach->Outputs) {
   6170             for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
   6171                if (memcmp(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]))) {
   6172                   uint j;
   6173 
   6174                   memcpy(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]));
   6175                   debug_printf("OUT[%2u] =  ", i);
   6176                   for (j = 0; j < 4; j++) {
   6177                      if (j > 0) {
   6178                         debug_printf("           ");
   6179                      }
   6180                      debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
   6181                                   outputs[i].xyzw[0].f[j], outputs[i].xyzw[0].u[j],
   6182                                   outputs[i].xyzw[1].f[j], outputs[i].xyzw[1].u[j],
   6183                                   outputs[i].xyzw[2].f[j], outputs[i].xyzw[2].u[j],
   6184                                   outputs[i].xyzw[3].f[j], outputs[i].xyzw[3].u[j]);
   6185                   }
   6186                }
   6187             }
   6188          }
   6189 #endif
   6190       }
   6191    }
   6192 
   6193 #if 0
   6194    /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
   6195    if (mach->ShaderType == PIPE_SHADER_FRAGMENT) {
   6196       /*
   6197        * Scale back depth component.
   6198        */
   6199       for (i = 0; i < 4; i++)
   6200          mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
   6201    }
   6202 #endif
   6203 
   6204    /* Strictly speaking, these assertions aren't really needed but they
   6205     * can potentially catch some bugs in the control flow code.
   6206     */
   6207    assert(mach->CondStackTop == 0);
   6208    assert(mach->LoopStackTop == 0);
   6209    assert(mach->ContStackTop == 0);
   6210    assert(mach->SwitchStackTop == 0);
   6211    assert(mach->BreakStackTop == 0);
   6212    assert(mach->CallStackTop == 0);
   6213 
   6214    return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
   6215 }
   6216